diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,78532 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.99974206860975,
+  "eval_steps": 500,
+  "global_step": 2907,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1169.0,
+      "completions/max_terminated_length": 1169.0,
+      "completions/mean_length": 494.0625305175781,
+      "completions/mean_terminated_length": 494.0625305175781,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 0.0010317255610007739,
+      "grad_norm": 1.318339467048645,
+      "kl": 0.0,
+      "learning_rate": 1e-06,
+      "loss": 0.0159,
+      "num_tokens": 116746.0,
+      "reward": 0.28214287757873535,
+      "reward_std": 0.4351659417152405,
+      "rewards/code_format_reward/mean": 0.2321428507566452,
+      "rewards/code_format_reward/std": 0.4240971803665161,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05000000074505806,
+      "rewards/curriculum_aware_reward_fn/std": 0.12302493304014206,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3506.0,
+      "completions/max_terminated_length": 3506.0,
+      "completions/mean_length": 516.8660888671875,
+      "completions/mean_terminated_length": 516.8660888671875,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "epoch": 0.0020634511220015478,
+      "grad_norm": 1.158948302268982,
+      "kl": 0.0006999969482421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0014,
+      "num_tokens": 238349.0,
+      "reward": 0.5758929252624512,
+      "reward_std": 0.47179263830184937,
+      "rewards/code_format_reward/mean": 0.4821428656578064,
+      "rewards/code_format_reward/std": 0.5019267797470093,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0937499925494194,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1131.0,
+      "completions/max_terminated_length": 1131.0,
+      "completions/mean_length": 424.33929443359375,
+      "completions/mean_terminated_length": 424.33929443359375,
+      "completions/min_length": 135.0,
+      "completions/min_terminated_length": 135.0,
+      "epoch": 0.003095176683002321,
+      "grad_norm": 1.1996846199035645,
+      "kl": 0.004955291748046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0258,
+      "num_tokens": 356799.0,
+      "reward": 0.8459821939468384,
+      "reward_std": 0.3950677216053009,
+      "rewards/code_format_reward/mean": 0.7678571343421936,
+      "rewards/code_format_reward/std": 0.4240972101688385,
+      "rewards/curriculum_aware_reward_fn/mean": 0.078125,
+      "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1900.0,
+      "completions/max_terminated_length": 1900.0,
+      "completions/mean_length": 483.3482360839844,
+      "completions/mean_terminated_length": 483.3482360839844,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 0.0041269022440030955,
+      "grad_norm": 0.8876407146453857,
+      "kl": 0.00455474853515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0041,
+      "num_tokens": 481704.0,
+      "reward": 0.9250000715255737,
+      "reward_std": 0.2708400785923004,
+      "rewards/code_format_reward/mean": 0.875,
+      "rewards/code_format_reward/std": 0.33220529556274414,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05000000074505806,
+      "rewards/curriculum_aware_reward_fn/std": 0.12302493304014206,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 990.0,
+      "completions/max_terminated_length": 990.0,
+      "completions/mean_length": 445.794677734375,
+      "completions/mean_terminated_length": 445.794677734375,
+      "completions/min_length": 166.0,
+      "completions/min_terminated_length": 166.0,
+      "epoch": 0.005158627805003869,
+      "grad_norm": 0.6923364996910095,
+      "kl": 0.00545501708984375,
+      "learning_rate": 1e-06,
+      "loss": 0.016,
+      "num_tokens": 585043.0,
+      "reward": 1.0,
+      "reward_std": 0.17492596805095673,
+      "rewards/code_format_reward/mean": 0.9375,
+      "rewards/code_format_reward/std": 0.24314938485622406,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0625,
+      "rewards/curriculum_aware_reward_fn/std": 0.13465002179145813,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 904.0,
+      "completions/max_terminated_length": 904.0,
+      "completions/mean_length": 486.4464416503906,
+      "completions/mean_terminated_length": 486.4464416503906,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 0.006190353366004642,
+      "grad_norm": 0.5436791777610779,
+      "kl": 0.0048828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0092,
+      "num_tokens": 708685.0,
+      "reward": 1.0040178298950195,
+      "reward_std": 0.0781477838754654,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.02187499962747097,
+      "rewards/curriculum_aware_reward_fn/std": 0.08510228246450424,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 685.0,
+      "completions/max_terminated_length": 685.0,
+      "completions/mean_length": 429.01788330078125,
+      "completions/mean_terminated_length": 429.01788330078125,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 0.007222078927005417,
+      "grad_norm": 0.7159688472747803,
+      "kl": 0.00606536865234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0251,
+      "num_tokens": 819672.0,
+      "reward": 1.0669643878936768,
+      "reward_std": 0.12266332656145096,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569157898426056,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 926.0,
+      "completions/max_terminated_length": 926.0,
+      "completions/mean_length": 455.6339416503906,
+      "completions/mean_terminated_length": 455.6339416503906,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 0.008253804488006191,
+      "grad_norm": 0.6650537252426147,
+      "kl": 0.006988525390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0043,
+      "num_tokens": 934536.0,
+      "reward": 1.0316965579986572,
+      "reward_std": 0.08898404985666275,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.04062500223517418,
+      "rewards/curriculum_aware_reward_fn/std": 0.11261255294084549,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1310.0,
+      "completions/max_terminated_length": 1310.0,
+      "completions/mean_length": 413.3035888671875,
+      "completions/mean_terminated_length": 413.3035888671875,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 0.009285530049006964,
+      "grad_norm": 0.743399441242218,
+      "kl": 0.00730133056640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0313,
+      "num_tokens": 1041565.0,
+      "reward": 1.1218750476837158,
+      "reward_std": 0.10294599086046219,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1143.0,
+      "completions/max_terminated_length": 1143.0,
+      "completions/mean_length": 436.4285888671875,
+      "completions/mean_terminated_length": 436.4285888671875,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 0.010317255610007738,
+      "grad_norm": 0.6795259118080139,
+      "kl": 0.007244110107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0189,
+      "num_tokens": 1154796.0,
+      "reward": 1.1071429252624512,
+      "reward_std": 0.11945624649524689,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 934.0,
+      "completions/max_terminated_length": 934.0,
+      "completions/mean_length": 481.3214416503906,
+      "completions/mean_terminated_length": 481.3214416503906,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 0.011348981171008512,
+      "grad_norm": 0.5997210741043091,
+      "kl": 0.005950927734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0182,
+      "num_tokens": 1270556.0,
+      "reward": 1.054464340209961,
+      "reward_std": 0.1270672082901001,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08124999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.14843402802944183,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 764.0,
+      "completions/max_terminated_length": 764.0,
+      "completions/mean_length": 443.15179443359375,
+      "completions/mean_terminated_length": 443.15179443359375,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "epoch": 0.012380706732009285,
+      "grad_norm": 0.6722776293754578,
+      "kl": 0.00591278076171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0041,
+      "num_tokens": 1385646.0,
+      "reward": 1.0593750476837158,
+      "reward_std": 0.08607304841279984,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05937499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1456.0,
+      "completions/max_terminated_length": 1456.0,
+      "completions/mean_length": 550.3125,
+      "completions/mean_terminated_length": 550.3125,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 0.013412432293010059,
+      "grad_norm": 0.6148671507835388,
+      "kl": 0.00577545166015625,
+      "learning_rate": 1e-06,
+      "loss": -0.001,
+      "num_tokens": 1514299.0,
+      "reward": 1.0660715103149414,
+      "reward_std": 0.10993208736181259,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07499999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.144259512424469,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1899.0,
+      "completions/max_terminated_length": 1899.0,
+      "completions/mean_length": 564.6339721679688,
+      "completions/mean_terminated_length": 564.6339721679688,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 0.014444157854010833,
+      "grad_norm": 0.6636642813682556,
+      "kl": 0.0090484619140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0041,
+      "num_tokens": 1654104.0,
+      "reward": 0.9424107670783997,
+      "reward_std": 0.23124831914901733,
+      "rewards/code_format_reward/mean": 0.9017857313156128,
+      "rewards/code_format_reward/std": 0.2989417314529419,
+      "rewards/curriculum_aware_reward_fn/mean": 0.04062500223517418,
+      "rewards/curriculum_aware_reward_fn/std": 0.11261255294084549,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 939.0,
+      "completions/max_terminated_length": 939.0,
+      "completions/mean_length": 467.96429443359375,
+      "completions/mean_terminated_length": 467.96429443359375,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 0.015475883415011608,
+      "grad_norm": 0.542773962020874,
+      "kl": 0.00925445556640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0009,
+      "num_tokens": 1775382.0,
+      "reward": 1.041517972946167,
+      "reward_std": 0.07933254539966583,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05937499925494194,
+      "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 936.0,
+      "completions/max_terminated_length": 936.0,
+      "completions/mean_length": 417.71429443359375,
+      "completions/mean_terminated_length": 417.71429443359375,
+      "completions/min_length": 154.0,
+      "completions/min_terminated_length": 154.0,
+      "epoch": 0.016507608976012382,
+      "grad_norm": 0.6454997062683105,
+      "kl": 0.0098419189453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0077,
+      "num_tokens": 1889737.0,
+      "reward": 1.0732142925262451,
+      "reward_std": 0.13696174323558807,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1122.0,
+      "completions/max_terminated_length": 1122.0,
+      "completions/mean_length": 518.25,
+      "completions/mean_terminated_length": 518.25,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "epoch": 0.017539334537013153,
+      "grad_norm": 0.49701130390167236,
+      "kl": 0.00626373291015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0162,
+      "num_tokens": 2026720.0,
+      "reward": 1.056249976158142,
+      "reward_std": 0.06265628337860107,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05624999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.12912124395370483,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1654.0,
+      "completions/max_terminated_length": 1654.0,
+      "completions/mean_length": 569.5,
+      "completions/mean_terminated_length": 569.5,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "epoch": 0.018571060098013927,
+      "grad_norm": 0.4273010492324829,
+      "kl": 0.00580596923828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0016,
+      "num_tokens": 2159447.0,
+      "reward": 0.9883929491043091,
+      "reward_std": 0.0637814998626709,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0062500000931322575,
+      "rewards/curriculum_aware_reward_fn/std": 0.04655956104397774,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1708.0,
+      "completions/max_terminated_length": 1708.0,
+      "completions/mean_length": 495.89288330078125,
+      "completions/mean_terminated_length": 495.89288330078125,
+      "completions/min_length": 111.0,
+      "completions/min_terminated_length": 111.0,
+      "epoch": 0.0196027856590147,
+      "grad_norm": 0.5998135805130005,
+      "kl": 0.0064849853515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0182,
+      "num_tokens": 2282838.0,
+      "reward": 1.0848214626312256,
+      "reward_std": 0.10725849866867065,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1220.0,
+      "completions/max_terminated_length": 1220.0,
+      "completions/mean_length": 477.0714416503906,
+      "completions/mean_terminated_length": 477.0714416503906,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 0.020634511220015476,
+      "grad_norm": 0.7616762518882751,
+      "kl": 0.00719451904296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0129,
+      "num_tokens": 2399480.0,
+      "reward": 1.079017996788025,
+      "reward_std": 0.15464720129966736,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09687500447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.17709888517856598,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 884.0,
+      "completions/max_terminated_length": 884.0,
+      "completions/mean_length": 448.40179443359375,
+      "completions/mean_terminated_length": 448.40179443359375,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 0.02166623678101625,
+      "grad_norm": 0.5755221843719482,
+      "kl": 0.00826263427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0201,
+      "num_tokens": 2516544.0,
+      "reward": 1.1062500476837158,
+      "reward_std": 0.06990548223257065,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2976.0,
+      "completions/max_terminated_length": 2976.0,
+      "completions/mean_length": 522.8214721679688,
+      "completions/mean_terminated_length": 522.8214721679688,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 0.022697962342017024,
+      "grad_norm": 0.6775954961776733,
+      "kl": 0.007049560546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0126,
+      "num_tokens": 2646255.0,
+      "reward": 1.1031250953674316,
+      "reward_std": 0.10498352348804474,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1326.0,
+      "completions/max_terminated_length": 1326.0,
+      "completions/mean_length": 466.6607360839844,
+      "completions/mean_terminated_length": 466.6607360839844,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 0.0237296879030178,
+      "grad_norm": 0.6193450093269348,
+      "kl": 0.00760650634765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0105,
+      "num_tokens": 2770201.0,
+      "reward": 1.1125001907348633,
+      "reward_std": 0.07438036799430847,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11250000447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1035.0,
+      "completions/max_terminated_length": 1035.0,
+      "completions/mean_length": 439.4285888671875,
+      "completions/mean_terminated_length": 439.4285888671875,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "epoch": 0.02476141346401857,
+      "grad_norm": 0.5675091743469238,
+      "kl": 0.00740814208984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0062,
+      "num_tokens": 2885042.0,
+      "reward": 1.1062500476837158,
+      "reward_std": 0.06854972243309021,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1473.0,
+      "completions/max_terminated_length": 1473.0,
+      "completions/mean_length": 513.9375,
+      "completions/mean_terminated_length": 513.9375,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 0.025793139025019344,
+      "grad_norm": 0.6651517152786255,
+      "kl": 0.0066986083984375,
+      "learning_rate": 1e-06,
+      "loss": 0.001,
+      "num_tokens": 3012092.0,
+      "reward": 1.0848215818405151,
+      "reward_std": 0.11312052607536316,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0937499925494194,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 760.0,
+      "completions/max_terminated_length": 760.0,
+      "completions/mean_length": 467.5000305175781,
+      "completions/mean_terminated_length": 467.5000305175781,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 0.026824864586020118,
+      "grad_norm": 0.46935516595840454,
+      "kl": 0.00611114501953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0055,
+      "num_tokens": 3133073.0,
+      "reward": 1.0593750476837158,
+      "reward_std": 0.04615173488855362,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05937499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 710.0,
+      "completions/max_terminated_length": 710.0,
+      "completions/mean_length": 427.3214416503906,
+      "completions/mean_terminated_length": 427.3214416503906,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 0.027856590147020893,
+      "grad_norm": 0.7585523724555969,
+      "kl": 0.00814056396484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0275,
+      "num_tokens": 3249883.0,
+      "reward": 1.125000238418579,
+      "reward_std": 0.09292245656251907,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 820.0,
+      "completions/max_terminated_length": 820.0,
+      "completions/mean_length": 489.70538330078125,
+      "completions/mean_terminated_length": 489.70538330078125,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 0.028888315708021667,
+      "grad_norm": 0.4714300334453583,
+      "kl": 0.00653076171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0087,
+      "num_tokens": 3378412.0,
+      "reward": 1.053125023841858,
+      "reward_std": 0.04717051237821579,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05312499776482582,
+      "rewards/curriculum_aware_reward_fn/std": 0.1261489987373352,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1270.0,
+      "completions/max_terminated_length": 1270.0,
+      "completions/mean_length": 452.8035888671875,
+      "completions/mean_terminated_length": 452.8035888671875,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 0.02992004126902244,
+      "grad_norm": 0.6367431282997131,
+      "kl": 0.00838470458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0022,
+      "num_tokens": 3495220.0,
+      "reward": 1.0875000953674316,
+      "reward_std": 0.08884736895561218,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08749999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 941.0,
+      "completions/max_terminated_length": 941.0,
+      "completions/mean_length": 449.76788330078125,
+      "completions/mean_terminated_length": 449.76788330078125,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 0.030951766830023215,
+      "grad_norm": 0.6237981915473938,
+      "kl": 0.009368896484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0013,
+      "num_tokens": 3608552.0,
+      "reward": 1.1031250953674316,
+      "reward_std": 0.08083000034093857,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10312499850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1214.0,
+      "completions/max_terminated_length": 1214.0,
+      "completions/mean_length": 486.21429443359375,
+      "completions/mean_terminated_length": 486.21429443359375,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 0.031983492391023986,
+      "grad_norm": 0.5921357274055481,
+      "kl": 0.0074462890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0012,
+      "num_tokens": 3732091.0,
+      "reward": 1.0750000476837158,
+      "reward_std": 0.08021100610494614,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07499999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.144259512424469,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 952.0,
+      "completions/max_terminated_length": 952.0,
+      "completions/mean_length": 461.357177734375,
+      "completions/mean_terminated_length": 461.357177734375,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "epoch": 0.033015217952024764,
+      "grad_norm": 0.698621392250061,
+      "kl": 0.00798797607421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0067,
+      "num_tokens": 3848961.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.08847898989915848,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1653594672679901,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1045.0,
+      "completions/max_terminated_length": 1045.0,
+      "completions/mean_length": 456.732177734375,
+      "completions/mean_terminated_length": 456.732177734375,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "epoch": 0.034046943513025535,
+      "grad_norm": 0.6826543211936951,
+      "kl": 0.008026123046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0,
+      "num_tokens": 3961183.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.08746020495891571,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1653594672679901,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1612.0,
+      "completions/max_terminated_length": 1612.0,
+      "completions/mean_length": 492.232177734375,
+      "completions/mean_terminated_length": 492.232177734375,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "epoch": 0.035078669074026306,
+      "grad_norm": 0.5144251585006714,
+      "kl": 0.00841522216796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0074,
+      "num_tokens": 4077941.0,
+      "reward": 1.0687501430511475,
+      "reward_std": 0.06228790059685707,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06874999403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 890.0,
+      "completions/max_terminated_length": 890.0,
+      "completions/mean_length": 500.1785888671875,
+      "completions/mean_terminated_length": 500.1785888671875,
+      "completions/min_length": 121.0,
+      "completions/min_terminated_length": 121.0,
+      "epoch": 0.036110394635027084,
+      "grad_norm": 0.4933493733406067,
+      "kl": 0.009246826171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0002,
+      "num_tokens": 4199754.0,
+      "reward": 1.068750023841858,
+      "reward_std": 0.05096360296010971,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 866.0,
+      "completions/max_terminated_length": 866.0,
+      "completions/mean_length": 478.58038330078125,
+      "completions/mean_terminated_length": 478.58038330078125,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 0.037142120196027854,
+      "grad_norm": 0.5227413177490234,
+      "kl": 0.00762176513671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0116,
+      "num_tokens": 4326365.0,
+      "reward": 1.0812500715255737,
+      "reward_std": 0.07092425972223282,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08124999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 823.0,
+      "completions/max_terminated_length": 823.0,
+      "completions/mean_length": 412.9821472167969,
+      "completions/mean_terminated_length": 412.9821472167969,
+      "completions/min_length": 161.0,
+      "completions/min_terminated_length": 161.0,
+      "epoch": 0.03817384575702863,
+      "grad_norm": 0.6366561651229858,
+      "kl": 0.01012420654296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0017,
+      "num_tokens": 4444495.0,
+      "reward": 1.0937501192092896,
+      "reward_std": 0.08746020495891571,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 804.0,
+      "completions/max_terminated_length": 804.0,
+      "completions/mean_length": 428.6875305175781,
+      "completions/mean_terminated_length": 428.6875305175781,
+      "completions/min_length": 254.0,
+      "completions/min_terminated_length": 254.0,
+      "epoch": 0.0392055713180294,
+      "grad_norm": 0.7173018455505371,
+      "kl": 0.01025390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0021,
+      "num_tokens": 4551553.0,
+      "reward": 1.1187500953674316,
+      "reward_std": 0.08847897499799728,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645818948745728,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 915.0,
+      "completions/max_terminated_length": 915.0,
+      "completions/mean_length": 469.8125305175781,
+      "completions/mean_terminated_length": 469.8125305175781,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 0.04023729687903018,
+      "grad_norm": 0.679751992225647,
+      "kl": 0.0091400146484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0217,
+      "num_tokens": 4676041.0,
+      "reward": 1.0656250715255737,
+      "reward_std": 0.08363571017980576,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06562499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.13722330331802368,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 915.0,
+      "completions/max_terminated_length": 915.0,
+      "completions/mean_length": 497.5000305175781,
+      "completions/mean_terminated_length": 497.5000305175781,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 0.04126902244003095,
+      "grad_norm": 0.5930312871932983,
+      "kl": 0.00982666015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0113,
+      "num_tokens": 4799416.0,
+      "reward": 1.0531251430511475,
+      "reward_std": 0.07641790807247162,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05312499776482582,
+      "rewards/curriculum_aware_reward_fn/std": 0.1261489987373352,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 818.0,
+      "completions/max_terminated_length": 818.0,
+      "completions/mean_length": 410.6607360839844,
+      "completions/mean_terminated_length": 410.6607360839844,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 0.04230074800103172,
+      "grad_norm": 0.7011821866035461,
+      "kl": 0.0117645263671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0072,
+      "num_tokens": 4907101.0,
+      "reward": 1.109375,
+      "reward_std": 0.09430962055921555,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.109375,
+      "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1027.0,
+      "completions/max_terminated_length": 1027.0,
+      "completions/mean_length": 432.2589416503906,
+      "completions/mean_terminated_length": 432.2589416503906,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 0.0433324735620325,
+      "grad_norm": 0.40348029136657715,
+      "kl": 0.0109710693359375,
+      "learning_rate": 1e-06,
+      "loss": 0.01,
+      "num_tokens": 5025192.0,
+      "reward": 1.0687501430511475,
+      "reward_std": 0.022366588935256004,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1081.0,
+      "completions/max_terminated_length": 1081.0,
+      "completions/mean_length": 409.8571472167969,
+      "completions/mean_terminated_length": 409.8571472167969,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 0.04436419912303327,
+      "grad_norm": 0.7503907680511475,
+      "kl": 0.0126495361328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0098,
+      "num_tokens": 5140459.0,
+      "reward": 1.1031250953674316,
+      "reward_std": 0.09394122660160065,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10312499850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1031.0,
+      "completions/max_terminated_length": 1031.0,
+      "completions/mean_length": 426.8750305175781,
+      "completions/mean_terminated_length": 426.8750305175781,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 0.04539592468403405,
+      "grad_norm": 0.5602256655693054,
+      "kl": 0.011505126953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0017,
+      "num_tokens": 5252585.0,
+      "reward": 1.0723215341567993,
+      "reward_std": 0.07662393152713776,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08124999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 662.0,
+      "completions/max_terminated_length": 662.0,
+      "completions/mean_length": 401.7946472167969,
+      "completions/mean_terminated_length": 401.7946472167969,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 0.04642765024503482,
+      "grad_norm": 0.6367837190628052,
+      "kl": 0.011199951171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0068,
+      "num_tokens": 5355728.0,
+      "reward": 1.078125,
+      "reward_std": 0.07434897124767303,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.078125,
+      "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 716.0,
+      "completions/max_terminated_length": 716.0,
+      "completions/mean_length": 388.1607360839844,
+      "completions/mean_terminated_length": 388.1607360839844,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 0.0474593758060356,
+      "grad_norm": 0.6154026389122009,
+      "kl": 0.014129638671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0015,
+      "num_tokens": 5469559.0,
+      "reward": 1.0875000953674316,
+      "reward_std": 0.05784441903233528,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.15223558247089386,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1075.0,
+      "completions/max_terminated_length": 1075.0,
+      "completions/mean_length": 408.83038330078125,
+      "completions/mean_terminated_length": 408.83038330078125,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 0.04849110136703637,
+      "grad_norm": 0.7130938768386841,
+      "kl": 0.0154571533203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0225,
+      "num_tokens": 5587539.0,
+      "reward": 1.1125000715255737,
+      "reward_std": 0.07780507206916809,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11249999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 958.0,
+      "completions/max_terminated_length": 958.0,
+      "completions/mean_length": 429.2410888671875,
+      "completions/mean_terminated_length": 429.2410888671875,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "epoch": 0.04952282692803714,
+      "grad_norm": 0.7699652314186096,
+      "kl": 0.015350341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0145,
+      "num_tokens": 5702863.0,
+      "reward": 1.1031250953674316,
+      "reward_std": 0.08709181845188141,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1635.0,
+      "completions/max_terminated_length": 1635.0,
+      "completions/mean_length": 436.9464416503906,
+      "completions/mean_terminated_length": 436.9464416503906,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 0.05055455248903792,
+      "grad_norm": 0.4649677574634552,
+      "kl": 0.0119171142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0095,
+      "num_tokens": 5813534.0,
+      "reward": 1.03125,
+      "reward_std": 0.04232724383473396,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.03125,
+      "rewards/curriculum_aware_reward_fn/std": 0.10025305300951004,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 836.0,
+      "completions/max_terminated_length": 836.0,
+      "completions/mean_length": 442.58929443359375,
+      "completions/mean_terminated_length": 442.58929443359375,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "epoch": 0.05158627805003869,
+      "grad_norm": 0.5999484062194824,
+      "kl": 0.0105743408203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0132,
+      "num_tokens": 5941763.0,
+      "reward": 1.0625,
+      "reward_std": 0.06025035306811333,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0625,
+      "rewards/curriculum_aware_reward_fn/std": 0.13465002179145813,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 660.0,
+      "completions/max_terminated_length": 660.0,
+      "completions/mean_length": 379.1964416503906,
+      "completions/mean_terminated_length": 379.1964416503906,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 0.052618003611039466,
+      "grad_norm": 0.43735066056251526,
+      "kl": 0.013519287109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0117,
+      "num_tokens": 6050796.0,
+      "reward": 1.068750023841858,
+      "reward_std": 0.03890253230929375,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 920.0,
+      "completions/max_terminated_length": 920.0,
+      "completions/mean_length": 413.6785888671875,
+      "completions/mean_terminated_length": 413.6785888671875,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 0.053649729172040236,
+      "grad_norm": 0.7055557370185852,
+      "kl": 0.0141754150390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0167,
+      "num_tokens": 6158672.0,
+      "reward": 1.0843751430511475,
+      "reward_std": 0.07678630203008652,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08437500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 808.0,
+      "completions/max_terminated_length": 808.0,
+      "completions/mean_length": 438.7321472167969,
+      "completions/mean_terminated_length": 438.7321472167969,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 0.054681454733041014,
+      "grad_norm": 0.5913369655609131,
+      "kl": 0.0139923095703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0031,
+      "num_tokens": 6282654.0,
+      "reward": 1.0531251430511475,
+      "reward_std": 0.06268768012523651,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05312499776482582,
+      "rewards/curriculum_aware_reward_fn/std": 0.1261489987373352,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 742.0,
+      "completions/max_terminated_length": 742.0,
+      "completions/mean_length": 371.8750305175781,
+      "completions/mean_terminated_length": 371.8750305175781,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 0.055713180294041785,
+      "grad_norm": 0.5662503242492676,
+      "kl": 0.016754150390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0286,
+      "num_tokens": 6394360.0,
+      "reward": 1.0812500715255737,
+      "reward_std": 0.07777366787195206,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08124999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 750.0,
+      "completions/max_terminated_length": 750.0,
+      "completions/mean_length": 375.9821472167969,
+      "completions/mean_terminated_length": 375.9821472167969,
+      "completions/min_length": 128.0,
+      "completions/min_terminated_length": 128.0,
+      "epoch": 0.056744905855042556,
+      "grad_norm": 0.7322971224784851,
+      "kl": 0.0171356201171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0061,
+      "num_tokens": 6508633.0,
+      "reward": 1.1281250715255737,
+      "reward_std": 0.09230346977710724,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 925.0,
+      "completions/max_terminated_length": 925.0,
+      "completions/mean_length": 379.1964416503906,
+      "completions/mean_terminated_length": 379.1964416503906,
+      "completions/min_length": 143.0,
+      "completions/min_terminated_length": 143.0,
+      "epoch": 0.057776631416043334,
+      "grad_norm": 0.5264835953712463,
+      "kl": 0.01763916015625,
+      "learning_rate": 1e-06,
+      "loss": 0.005,
+      "num_tokens": 6619143.0,
+      "reward": 1.1187502145767212,
+      "reward_std": 0.0671311616897583,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.19114695489406586,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 748.0,
+      "completions/max_terminated_length": 748.0,
+      "completions/mean_length": 414.58929443359375,
+      "completions/mean_terminated_length": 414.58929443359375,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 0.058808356977044104,
+      "grad_norm": 0.4410315752029419,
+      "kl": 0.0155792236328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0135,
+      "num_tokens": 6727877.0,
+      "reward": 1.0875000953674316,
+      "reward_std": 0.03165333718061447,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 656.0,
+      "completions/max_terminated_length": 656.0,
+      "completions/mean_length": 364.95538330078125,
+      "completions/mean_terminated_length": 364.95538330078125,
+      "completions/min_length": 97.0,
+      "completions/min_terminated_length": 97.0,
+      "epoch": 0.05984008253804488,
+      "grad_norm": 0.6999755501747131,
+      "kl": 0.01708984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0038,
+      "num_tokens": 6832991.0,
+      "reward": 1.0906251668930054,
+      "reward_std": 0.07984261959791183,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 793.0,
+      "completions/max_terminated_length": 793.0,
+      "completions/mean_length": 381.3839416503906,
+      "completions/mean_terminated_length": 381.3839416503906,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 0.06087180809904565,
+      "grad_norm": 0.49779650568962097,
+      "kl": 0.0159149169921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0072,
+      "num_tokens": 6942577.0,
+      "reward": 1.1031250953674316,
+      "reward_std": 0.05744463577866554,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10312499850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 693.0,
+      "completions/max_terminated_length": 693.0,
+      "completions/mean_length": 346.0000305175781,
+      "completions/mean_terminated_length": 346.0000305175781,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "epoch": 0.06190353366004643,
+      "grad_norm": 0.6053861379623413,
+      "kl": 0.017669677734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0108,
+      "num_tokens": 7042495.0,
+      "reward": 1.1468751430511475,
+      "reward_std": 0.05886319279670715,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14687499403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 848.0,
+      "completions/max_terminated_length": 848.0,
+      "completions/mean_length": 426.1875305175781,
+      "completions/mean_terminated_length": 426.1875305175781,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "epoch": 0.0629352592210472,
+      "grad_norm": 0.49604561924934387,
+      "kl": 0.01611328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0079,
+      "num_tokens": 7159248.0,
+      "reward": 1.109375,
+      "reward_std": 0.05300115421414375,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.109375,
+      "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 766.0,
+      "completions/max_terminated_length": 766.0,
+      "completions/mean_length": 386.40179443359375,
+      "completions/mean_terminated_length": 386.40179443359375,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 0.06396698478204797,
+      "grad_norm": 0.7982628345489502,
+      "kl": 0.019561767578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0028,
+      "num_tokens": 7274324.0,
+      "reward": 1.093750238418579,
+      "reward_std": 0.09779711812734604,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1083.0,
+      "completions/max_terminated_length": 1083.0,
+      "completions/mean_length": 393.02679443359375,
+      "completions/mean_terminated_length": 393.02679443359375,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "epoch": 0.06499871034304874,
+      "grad_norm": 0.703448474407196,
+      "kl": 0.01800537109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0075,
+      "num_tokens": 7392375.0,
+      "reward": 1.0937501192092896,
+      "reward_std": 0.07157464325428009,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0937499925494194,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 642.0,
+      "completions/max_terminated_length": 642.0,
+      "completions/mean_length": 369.0357360839844,
+      "completions/mean_terminated_length": 369.0357360839844,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 0.06603043590404953,
+      "grad_norm": 0.7088890075683594,
+      "kl": 0.01617431640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0169,
+      "num_tokens": 7503448.0,
+      "reward": 1.0906251668930054,
+      "reward_std": 0.09773432463407516,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 660.0,
+      "completions/max_terminated_length": 660.0,
+      "completions/mean_length": 385.58929443359375,
+      "completions/mean_terminated_length": 385.58929443359375,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 0.0670621614650503,
+      "grad_norm": 0.6222151517868042,
+      "kl": 0.0174560546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0007,
+      "num_tokens": 7612679.0,
+      "reward": 1.078125,
+      "reward_std": 0.05988196283578873,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.078125,
+      "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 927.0,
+      "completions/max_terminated_length": 927.0,
+      "completions/mean_length": 420.6339416503906,
+      "completions/mean_terminated_length": 420.6339416503906,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 0.06809388702605107,
+      "grad_norm": 0.691880464553833,
+      "kl": 0.018096923828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0125,
+      "num_tokens": 7725625.0,
+      "reward": 1.1437500715255737,
+      "reward_std": 0.08505426347255707,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14374999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.21299293637275696,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1575.0,
+      "completions/max_terminated_length": 1575.0,
+      "completions/mean_length": 431.6964416503906,
+      "completions/mean_terminated_length": 431.6964416503906,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 0.06912561258705184,
+      "grad_norm": 0.6876183152198792,
+      "kl": 0.0158538818359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0117,
+      "num_tokens": 7832768.0,
+      "reward": 1.109375,
+      "reward_std": 0.1025775894522667,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.109375,
+      "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 772.0,
+      "completions/max_terminated_length": 772.0,
+      "completions/mean_length": 392.15179443359375,
+      "completions/mean_terminated_length": 392.15179443359375,
+      "completions/min_length": 152.0,
+      "completions/min_terminated_length": 152.0,
+      "epoch": 0.07015733814805261,
+      "grad_norm": 0.7046939134597778,
+      "kl": 0.017791748046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0103,
+      "num_tokens": 7945079.0,
+      "reward": 1.1187500953674316,
+      "reward_std": 0.0778050646185875,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 644.0,
+      "completions/max_terminated_length": 644.0,
+      "completions/mean_length": 356.5446472167969,
+      "completions/mean_terminated_length": 356.5446472167969,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 0.0711890637090534,
+      "grad_norm": 0.797570526599884,
+      "kl": 0.02020263671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0295,
+      "num_tokens": 8048810.0,
+      "reward": 1.1343750953674316,
+      "reward_std": 0.09572817385196686,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 632.0,
+      "completions/max_terminated_length": 632.0,
+      "completions/mean_length": 365.7321472167969,
+      "completions/mean_terminated_length": 365.7321472167969,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 0.07222078927005417,
+      "grad_norm": 0.7524502277374268,
+      "kl": 0.023345947265625,
+      "learning_rate": 1e-06,
+      "loss": 0.046,
+      "num_tokens": 8163741.0,
+      "reward": 1.1062500476837158,
+      "reward_std": 0.09088490903377533,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 842.0,
+      "completions/max_terminated_length": 842.0,
+      "completions/mean_length": 392.5446472167969,
+      "completions/mean_terminated_length": 392.5446472167969,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 0.07325251483105494,
+      "grad_norm": 0.6227968335151672,
+      "kl": 0.020721435546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0054,
+      "num_tokens": 8274094.0,
+      "reward": 1.0848214626312256,
+      "reward_std": 0.08387312293052673,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569157898426056,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 725.0,
+      "completions/max_terminated_length": 725.0,
+      "completions/mean_length": 397.6339416503906,
+      "completions/mean_terminated_length": 397.6339416503906,
+      "completions/min_length": 151.0,
+      "completions/min_terminated_length": 151.0,
+      "epoch": 0.07428424039205571,
+      "grad_norm": 0.6036502122879028,
+      "kl": 0.02008056640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0134,
+      "num_tokens": 8383855.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.0671311616897583,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562500894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 705.0,
+      "completions/max_terminated_length": 705.0,
+      "completions/mean_length": 379.2410888671875,
+      "completions/mean_terminated_length": 379.2410888671875,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 0.0753159659530565,
+      "grad_norm": 0.7189438343048096,
+      "kl": 0.02191162109375,
+      "learning_rate": 1e-06,
+      "loss": 0.029,
+      "num_tokens": 8506648.0,
+      "reward": 1.0437500476837158,
+      "reward_std": 0.08366710692644119,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.04374999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.11627185344696045,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 948.0,
+      "completions/max_terminated_length": 948.0,
+      "completions/mean_length": 395.3750305175781,
+      "completions/mean_terminated_length": 395.3750305175781,
+      "completions/min_length": 100.0,
+      "completions/min_terminated_length": 100.0,
+      "epoch": 0.07634769151405726,
+      "grad_norm": 0.8699377179145813,
+      "kl": 0.024932861328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0219,
+      "num_tokens": 8623345.0,
+      "reward": 1.0316965579986572,
+      "reward_std": 0.0907539427280426,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.04062500223517418,
+      "rewards/curriculum_aware_reward_fn/std": 0.11261255294084549,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 736.0,
+      "completions/max_terminated_length": 736.0,
+      "completions/mean_length": 369.5357360839844,
+      "completions/mean_terminated_length": 369.5357360839844,
+      "completions/min_length": 140.0,
+      "completions/min_terminated_length": 140.0,
+      "epoch": 0.07737941707505804,
+      "grad_norm": 0.6735115647315979,
+      "kl": 0.02252197265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0164,
+      "num_tokens": 8737736.0,
+      "reward": 1.0500000715255737,
+      "reward_std": 0.06851832568645477,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05000000074505806,
+      "rewards/curriculum_aware_reward_fn/std": 0.12302493304014206,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 783.0,
+      "completions/max_terminated_length": 783.0,
+      "completions/mean_length": 355.9821472167969,
+      "completions/mean_terminated_length": 355.9821472167969,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 0.0784111426360588,
+      "grad_norm": 0.5873785018920898,
+      "kl": 0.02325439453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0123,
+      "num_tokens": 8842349.0,
+      "reward": 1.0968750715255737,
+      "reward_std": 0.055438488721847534,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09687500447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.15729716420173645,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 687.0,
+      "completions/max_terminated_length": 687.0,
+      "completions/mean_length": 373.9107360839844,
+      "completions/mean_terminated_length": 373.9107360839844,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 0.07944286819705958,
+      "grad_norm": 0.863136351108551,
+      "kl": 0.02398681640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0125,
+      "num_tokens": 8949845.0,
+      "reward": 1.0906250476837158,
+      "reward_std": 0.10260899364948273,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 779.0,
+      "completions/max_terminated_length": 779.0,
+      "completions/mean_length": 382.4107360839844,
+      "completions/mean_terminated_length": 382.4107360839844,
+      "completions/min_length": 142.0,
+      "completions/min_terminated_length": 142.0,
+      "epoch": 0.08047459375806036,
+      "grad_norm": 0.5056124925613403,
+      "kl": 0.025146484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0016,
+      "num_tokens": 9048752.0,
+      "reward": 1.1218750476837158,
+      "reward_std": 0.053001150488853455,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 715.0,
+      "completions/max_terminated_length": 715.0,
+      "completions/mean_length": 423.02679443359375,
+      "completions/mean_terminated_length": 423.02679443359375,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 0.08150631931906113,
+      "grad_norm": 0.625390350818634,
+      "kl": 0.02386474609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0161,
+      "num_tokens": 9173222.0,
+      "reward": 1.0718750953674316,
+      "reward_std": 0.08021100610494614,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07187499850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1038.0,
+      "completions/max_terminated_length": 1038.0,
+      "completions/mean_length": 440.1160888671875,
+      "completions/mean_terminated_length": 440.1160888671875,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "epoch": 0.0825380448800619,
+      "grad_norm": 0.6255428791046143,
+      "kl": 0.021820068359375,
+      "learning_rate": 1e-06,
+      "loss": -0.005,
+      "num_tokens": 9292914.0,
+      "reward": 1.0906251668930054,
+      "reward_std": 0.06611238420009613,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 724.0,
+      "completions/max_terminated_length": 724.0,
+      "completions/mean_length": 355.95538330078125,
+      "completions/mean_terminated_length": 355.95538330078125,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 0.08356977044106267,
+      "grad_norm": 0.850692868232727,
+      "kl": 0.024566650390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0171,
+      "num_tokens": 9399526.0,
+      "reward": 1.0875000953674316,
+      "reward_std": 0.11288311332464218,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08749999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 715.0,
+      "completions/max_terminated_length": 715.0,
+      "completions/mean_length": 360.39288330078125,
+      "completions/mean_terminated_length": 360.39288330078125,
+      "completions/min_length": 161.0,
+      "completions/min_terminated_length": 161.0,
+      "epoch": 0.08460149600206344,
+      "grad_norm": 0.81606525182724,
+      "kl": 0.028076171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0153,
+      "num_tokens": 9509540.0,
+      "reward": 1.1343750953674316,
+      "reward_std": 0.09674695879220963,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437500596046448,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 655.0,
+      "completions/max_terminated_length": 655.0,
+      "completions/mean_length": 366.2232360839844,
+      "completions/mean_terminated_length": 366.2232360839844,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 0.08563322156306423,
+      "grad_norm": 0.6983723640441895,
+      "kl": 0.02325439453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0121,
+      "num_tokens": 9611618.0,
+      "reward": 1.0973215103149414,
+      "reward_std": 0.09658458083868027,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 912.0,
+      "completions/max_terminated_length": 912.0,
+      "completions/mean_length": 444.6160888671875,
+      "completions/mean_terminated_length": 444.6160888671875,
+      "completions/min_length": 166.0,
+      "completions/min_terminated_length": 166.0,
+      "epoch": 0.086664947124065,
+      "grad_norm": 0.59740149974823,
+      "kl": 0.020477294921875,
+      "learning_rate": 1e-06,
+      "loss": 0.014,
+      "num_tokens": 9724940.0,
+      "reward": 1.0625001192092896,
+      "reward_std": 0.08261694014072418,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0625,
+      "rewards/curriculum_aware_reward_fn/std": 0.13465002179145813,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 890.0,
+      "completions/max_terminated_length": 890.0,
+      "completions/mean_length": 407.6964416503906,
+      "completions/mean_terminated_length": 407.6964416503906,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 0.08769667268506577,
+      "grad_norm": 0.6780855059623718,
+      "kl": 0.023468017578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0123,
+      "num_tokens": 9827526.0,
+      "reward": 1.0906250476837158,
+      "reward_std": 0.07678630203008652,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 832.0,
+      "completions/max_terminated_length": 832.0,
+      "completions/mean_length": 391.5535888671875,
+      "completions/mean_terminated_length": 391.5535888671875,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 0.08872839824606654,
+      "grad_norm": 0.6756490468978882,
+      "kl": 0.019744873046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0015,
+      "num_tokens": 9936438.0,
+      "reward": 1.0937501192092896,
+      "reward_std": 0.08122977614402771,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 806.0,
+      "completions/max_terminated_length": 806.0,
+      "completions/mean_length": 383.7321472167969,
+      "completions/mean_terminated_length": 383.7321472167969,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 0.08976012380706733,
+      "grad_norm": 0.5501047968864441,
+      "kl": 0.02313232421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0037,
+      "num_tokens": 10049997.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.0612691231071949,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1653594672679901,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1270.0,
+      "completions/max_terminated_length": 1270.0,
+      "completions/mean_length": 432.8035888671875,
+      "completions/mean_terminated_length": 432.8035888671875,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 0.0907918493680681,
+      "grad_norm": 0.6241595149040222,
+      "kl": 0.019317626953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0023,
+      "num_tokens": 10163039.0,
+      "reward": 1.1000001430511475,
+      "reward_std": 0.06851832568645477,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.15882451832294464,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 783.0,
+      "completions/max_terminated_length": 783.0,
+      "completions/mean_length": 403.0357360839844,
+      "completions/mean_terminated_length": 403.0357360839844,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 0.09182357492906887,
+      "grad_norm": 0.6893149018287659,
+      "kl": 0.0233154296875,
+      "learning_rate": 1e-06,
+      "loss": 0.002,
+      "num_tokens": 10276142.0,
+      "reward": 1.0473215579986572,
+      "reward_std": 0.11236942559480667,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05625000223517418,
+      "rewards/curriculum_aware_reward_fn/std": 0.1374027132987976,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 646.0,
+      "completions/max_terminated_length": 646.0,
+      "completions/mean_length": 365.3035888671875,
+      "completions/mean_terminated_length": 365.3035888671875,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 0.09285530049006964,
+      "grad_norm": 0.6915412545204163,
+      "kl": 0.020111083984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0037,
+      "num_tokens": 10370004.0,
+      "reward": 1.1218751668930054,
+      "reward_std": 0.07296179980039597,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 662.0,
+      "completions/max_terminated_length": 662.0,
+      "completions/mean_length": 402.0982360839844,
+      "completions/mean_terminated_length": 402.0982360839844,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 0.09388702605107041,
+      "grad_norm": 0.7040698528289795,
+      "kl": 0.020416259765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0002,
+      "num_tokens": 10490156.0,
+      "reward": 1.0500000715255737,
+      "reward_std": 0.07678629457950592,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05000000074505806,
+      "rewards/curriculum_aware_reward_fn/std": 0.12302493304014206,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 914.0,
+      "completions/max_terminated_length": 914.0,
+      "completions/mean_length": 396.14288330078125,
+      "completions/mean_terminated_length": 396.14288330078125,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 0.0949187516120712,
+      "grad_norm": 0.5344059467315674,
+      "kl": 0.02386474609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0047,
+      "num_tokens": 10601045.0,
+      "reward": 1.071874976158142,
+      "reward_std": 0.0554070845246315,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07187499850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 856.0,
+      "completions/max_terminated_length": 856.0,
+      "completions/mean_length": 446.169677734375,
+      "completions/mean_terminated_length": 446.169677734375,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "epoch": 0.09595047717307197,
+      "grad_norm": 0.4314137101173401,
+      "kl": 0.020782470703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0042,
+      "num_tokens": 10729780.0,
+      "reward": 1.0531251430511475,
+      "reward_std": 0.03992130607366562,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05312499776482582,
+      "rewards/curriculum_aware_reward_fn/std": 0.1261489987373352,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 763.0,
+      "completions/max_terminated_length": 763.0,
+      "completions/mean_length": 394.21429443359375,
+      "completions/mean_terminated_length": 394.21429443359375,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 0.09698220273407274,
+      "grad_norm": 0.6997676491737366,
+      "kl": 0.021209716796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0077,
+      "num_tokens": 10847187.0,
+      "reward": 1.0937501192092896,
+      "reward_std": 0.08057939261198044,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 773.0,
+      "completions/max_terminated_length": 773.0,
+      "completions/mean_length": 394.45538330078125,
+      "completions/mean_terminated_length": 394.45538330078125,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 0.09801392829507351,
+      "grad_norm": 0.7313223481178284,
+      "kl": 0.021942138671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0098,
+      "num_tokens": 10953911.0,
+      "reward": 1.1687501668930054,
+      "reward_std": 0.09292246401309967,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16875000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17567436397075653,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 805.0,
+      "completions/max_terminated_length": 805.0,
+      "completions/mean_length": 402.9821472167969,
+      "completions/mean_terminated_length": 402.9821472167969,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 0.09904565385607428,
+      "grad_norm": 0.6087774038314819,
+      "kl": 0.024078369140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0051,
+      "num_tokens": 11056746.0,
+      "reward": 1.0500000715255737,
+      "reward_std": 0.06367506086826324,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05000000074505806,
+      "rewards/curriculum_aware_reward_fn/std": 0.12302493304014206,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1242.0,
+      "completions/max_terminated_length": 1242.0,
+      "completions/mean_length": 407.1339416503906,
+      "completions/mean_terminated_length": 407.1339416503906,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 0.10007737941707506,
+      "grad_norm": 0.6327843070030212,
+      "kl": 0.023345947265625,
+      "learning_rate": 1e-06,
+      "loss": 0.003,
+      "num_tokens": 11170154.0,
+      "reward": 1.125,
+      "reward_std": 0.07296179980039597,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845881938934326,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 921.0,
+      "completions/max_terminated_length": 921.0,
+      "completions/mean_length": 403.0535888671875,
+      "completions/mean_terminated_length": 403.0535888671875,
+      "completions/min_length": 151.0,
+      "completions/min_terminated_length": 151.0,
+      "epoch": 0.10110910497807583,
+      "grad_norm": 0.4795805811882019,
+      "kl": 0.022491455078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0124,
+      "num_tokens": 11279031.0,
+      "reward": 1.1000001430511475,
+      "reward_std": 0.028228629380464554,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.15882451832294464,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 769.0,
+      "completions/max_terminated_length": 769.0,
+      "completions/mean_length": 429.8571472167969,
+      "completions/mean_terminated_length": 429.8571472167969,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 0.1021408305390766,
+      "grad_norm": 0.548716127872467,
+      "kl": 0.02044677734375,
+      "learning_rate": 1e-06,
+      "loss": 0.008,
+      "num_tokens": 11393521.0,
+      "reward": 1.078125,
+      "reward_std": 0.06509362161159515,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.078125,
+      "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 797.0,
+      "completions/max_terminated_length": 797.0,
+      "completions/mean_length": 419.02679443359375,
+      "completions/mean_terminated_length": 419.02679443359375,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 0.10317255610007738,
+      "grad_norm": 0.5905047655105591,
+      "kl": 0.021240234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0087,
+      "num_tokens": 11512979.0,
+      "reward": 1.0718750953674316,
+      "reward_std": 0.06953709572553635,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07187499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1100.0,
+      "completions/max_terminated_length": 1100.0,
+      "completions/mean_length": 411.2857360839844,
+      "completions/mean_terminated_length": 411.2857360839844,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 0.10420428166107815,
+      "grad_norm": 0.6256059408187866,
+      "kl": 0.025115966796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0189,
+      "num_tokens": 11629246.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.07095565646886826,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562500894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 972.0,
+      "completions/max_terminated_length": 972.0,
+      "completions/mean_length": 479.294677734375,
+      "completions/mean_terminated_length": 479.294677734375,
+      "completions/min_length": 119.0,
+      "completions/min_terminated_length": 119.0,
+      "epoch": 0.10523600722207893,
+      "grad_norm": 0.6571237444877625,
+      "kl": 0.020477294921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0005,
+      "num_tokens": 11758037.0,
+      "reward": 1.09375,
+      "reward_std": 0.08122977614402771,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1053.0,
+      "completions/max_terminated_length": 1053.0,
+      "completions/mean_length": 421.5446472167969,
+      "completions/mean_terminated_length": 421.5446472167969,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 0.1062677327830797,
+      "grad_norm": 0.5904666185379028,
+      "kl": 0.02593994140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0078,
+      "num_tokens": 11871197.0,
+      "reward": 1.09375,
+      "reward_std": 0.0612691231071949,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569157898426056,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1443.0,
+      "completions/max_terminated_length": 1443.0,
+      "completions/mean_length": 438.1964416503906,
+      "completions/mean_terminated_length": 438.1964416503906,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 0.10729945834408047,
+      "grad_norm": 0.7135812640190125,
+      "kl": 0.021026611328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0449,
+      "num_tokens": 11986334.0,
+      "reward": 1.1218751668930054,
+      "reward_std": 0.08949775248765945,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 834.0,
+      "completions/max_terminated_length": 834.0,
+      "completions/mean_length": 433.6160888671875,
+      "completions/mean_terminated_length": 433.6160888671875,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 0.10833118390508124,
+      "grad_norm": 0.748449444770813,
+      "kl": 0.025787353515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0016,
+      "num_tokens": 12104731.0,
+      "reward": 1.09375,
+      "reward_std": 0.08986613899469376,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 844.0,
+      "completions/max_terminated_length": 844.0,
+      "completions/mean_length": 421.3125305175781,
+      "completions/mean_terminated_length": 421.3125305175781,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 0.10936290946608203,
+      "grad_norm": 0.6255056858062744,
+      "kl": 0.023193359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0051,
+      "num_tokens": 12216871.0,
+      "reward": 1.1129465103149414,
+      "reward_std": 0.09252168238162994,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 735.0,
+      "completions/max_terminated_length": 735.0,
+      "completions/mean_length": 431.8035888671875,
+      "completions/mean_terminated_length": 431.8035888671875,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 0.1103946350270828,
+      "grad_norm": 0.6440974473953247,
+      "kl": 0.024749755859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0021,
+      "num_tokens": 12332132.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.07296180725097656,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1653594672679901,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 904.0,
+      "completions/max_terminated_length": 904.0,
+      "completions/mean_length": 467.3750305175781,
+      "completions/mean_terminated_length": 467.3750305175781,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 0.11142636058808357,
+      "grad_norm": 0.4589020907878876,
+      "kl": 0.020294189453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0076,
+      "num_tokens": 12458314.0,
+      "reward": 1.0437501668930054,
+      "reward_std": 0.0364965982735157,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.04374999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.11627185344696045,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 919.0,
+      "completions/max_terminated_length": 919.0,
+      "completions/mean_length": 454.83929443359375,
+      "completions/mean_terminated_length": 454.83929443359375,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 0.11245808614908434,
+      "grad_norm": 0.6246293783187866,
+      "kl": 0.022918701171875,
+      "learning_rate": 1e-06,
+      "loss": -0.005,
+      "num_tokens": 12573229.0,
+      "reward": 1.0718750953674316,
+      "reward_std": 0.07882384210824966,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07187499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 889.0,
+      "completions/max_terminated_length": 889.0,
+      "completions/mean_length": 470.0625305175781,
+      "completions/mean_terminated_length": 470.0625305175781,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 0.11348981171008511,
+      "grad_norm": 0.45937812328338623,
+      "kl": 0.022705078125,
+      "learning_rate": 1e-06,
+      "loss": -0.017,
+      "num_tokens": 12703625.0,
+      "reward": 1.1031250953674316,
+      "reward_std": 0.04232724383473396,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 745.0,
+      "completions/max_terminated_length": 745.0,
+      "completions/mean_length": 402.5089416503906,
+      "completions/mean_terminated_length": 402.5089416503906,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 0.1145215372710859,
+      "grad_norm": 0.6906930804252625,
+      "kl": 0.030242919921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0069,
+      "num_tokens": 12818244.0,
+      "reward": 1.1343750953674316,
+      "reward_std": 0.08465448766946793,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1179.0,
+      "completions/max_terminated_length": 1179.0,
+      "completions/mean_length": 411.76788330078125,
+      "completions/mean_terminated_length": 411.76788330078125,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 0.11555326283208667,
+      "grad_norm": 0.6561564207077026,
+      "kl": 0.025909423828125,
+      "learning_rate": 1e-06,
+      "loss": 0.01,
+      "num_tokens": 12924310.0,
+      "reward": 1.1187500953674316,
+      "reward_std": 0.06367506086826324,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 900.0,
+      "completions/max_terminated_length": 900.0,
+      "completions/mean_length": 432.3839416503906,
+      "completions/mean_terminated_length": 432.3839416503906,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 0.11658498839308744,
+      "grad_norm": 0.7264392971992493,
+      "kl": 0.026153564453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0138,
+      "num_tokens": 13042268.0,
+      "reward": 1.09375,
+      "reward_std": 0.09394123405218124,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0937499925494194,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1294.0,
+      "completions/max_terminated_length": 1294.0,
+      "completions/mean_length": 435.27679443359375,
+      "completions/mean_terminated_length": 435.27679443359375,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 0.11761671395408821,
+      "grad_norm": 0.5870400071144104,
+      "kl": 0.0267333984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0287,
+      "num_tokens": 13162461.0,
+      "reward": 1.1093751192092896,
+      "reward_std": 0.07296179980039597,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.109375,
+      "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1016.0,
+      "completions/max_terminated_length": 1016.0,
+      "completions/mean_length": 478.2232360839844,
+      "completions/mean_terminated_length": 478.2232360839844,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 0.11864843951508898,
+      "grad_norm": 0.4974851608276367,
+      "kl": 0.025238037109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0021,
+      "num_tokens": 13290187.0,
+      "reward": 1.0250000953674316,
+      "reward_std": 0.054419707506895065,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.02500000037252903,
+      "rewards/curriculum_aware_reward_fn/std": 0.09054389595985413,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 985.0,
+      "completions/max_terminated_length": 985.0,
+      "completions/mean_length": 481.544677734375,
+      "completions/mean_terminated_length": 481.544677734375,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 0.11968016507608976,
+      "grad_norm": 0.47406908869743347,
+      "kl": 0.0255126953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0284,
+      "num_tokens": 13405972.0,
+      "reward": 1.1187500953674316,
+      "reward_std": 0.04818928614258766,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 943.0,
+      "completions/max_terminated_length": 943.0,
+      "completions/mean_length": 441.0089416503906,
+      "completions/mean_terminated_length": 441.0089416503906,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 0.12071189063709054,
+      "grad_norm": 0.5233811736106873,
+      "kl": 0.02685546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0066,
+      "num_tokens": 13514823.0,
+      "reward": 1.1281250715255737,
+      "reward_std": 0.0612691193819046,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16936305165290833,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1054.0,
+      "completions/max_terminated_length": 1054.0,
+      "completions/mean_length": 482.0000305175781,
+      "completions/mean_terminated_length": 482.0000305175781,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 0.1217436161980913,
+      "grad_norm": 0.575183629989624,
+      "kl": 0.02606201171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0178,
+      "num_tokens": 13631562.0,
+      "reward": 1.0562500953674316,
+      "reward_std": 0.06851832568645477,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05624999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.12912124395370483,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1041.0,
+      "completions/max_terminated_length": 1041.0,
+      "completions/mean_length": 471.7410888671875,
+      "completions/mean_terminated_length": 471.7410888671875,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "epoch": 0.12277534175909208,
+      "grad_norm": 0.6857424378395081,
+      "kl": 0.02569580078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0119,
+      "num_tokens": 13757400.0,
+      "reward": 1.1031250953674316,
+      "reward_std": 0.08709181845188141,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1008.0,
+      "completions/max_terminated_length": 1008.0,
+      "completions/mean_length": 484.58929443359375,
+      "completions/mean_terminated_length": 484.58929443359375,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 0.12380706732009286,
+      "grad_norm": 0.4834694266319275,
+      "kl": 0.026031494140625,
+      "learning_rate": 1e-06,
+      "loss": -0.001,
+      "num_tokens": 13879826.0,
+      "reward": 1.0687501430511475,
+      "reward_std": 0.059881966561079025,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1272.0,
+      "completions/max_terminated_length": 1272.0,
+      "completions/mean_length": 500.0982360839844,
+      "completions/mean_terminated_length": 500.0982360839844,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 0.12483879288109363,
+      "grad_norm": 0.5000414252281189,
+      "kl": 0.024505615234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0203,
+      "num_tokens": 14007160.0,
+      "reward": 1.1218751668930054,
+      "reward_std": 0.06228789687156677,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 719.0,
+      "completions/max_terminated_length": 719.0,
+      "completions/mean_length": 443.3214416503906,
+      "completions/mean_terminated_length": 443.3214416503906,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 0.1258705184420944,
+      "grad_norm": 0.6494409441947937,
+      "kl": 0.024810791015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0087,
+      "num_tokens": 14124453.0,
+      "reward": 1.0812500715255737,
+      "reward_std": 0.08746020495891571,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08125000447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1008.0,
+      "completions/max_terminated_length": 1008.0,
+      "completions/mean_length": 460.3035888671875,
+      "completions/mean_terminated_length": 460.3035888671875,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 0.1269022440030952,
+      "grad_norm": 0.5224548578262329,
+      "kl": 0.0277099609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0013,
+      "num_tokens": 14248128.0,
+      "reward": 1.1281250715255737,
+      "reward_std": 0.0616375133395195,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 939.0,
+      "completions/max_terminated_length": 939.0,
+      "completions/mean_length": 432.02679443359375,
+      "completions/mean_terminated_length": 432.02679443359375,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 0.12793396956409595,
+      "grad_norm": 0.7065784335136414,
+      "kl": 0.028961181640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0206,
+      "num_tokens": 14367362.0,
+      "reward": 1.1187500953674316,
+      "reward_std": 0.08607304841279984,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1797.0,
+      "completions/max_terminated_length": 1797.0,
+      "completions/mean_length": 461.1607360839844,
+      "completions/mean_terminated_length": 461.1607360839844,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 0.12896569512509673,
+      "grad_norm": 0.7404457330703735,
+      "kl": 0.03076171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0328,
+      "num_tokens": 14487052.0,
+      "reward": 1.1625001430511475,
+      "reward_std": 0.12836889922618866,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16250000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 973.0,
+      "completions/max_terminated_length": 973.0,
+      "completions/mean_length": 425.0714416503906,
+      "completions/mean_terminated_length": 425.0714416503906,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 0.1299974206860975,
+      "grad_norm": 0.7103374600410461,
+      "kl": 0.025634765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0045,
+      "num_tokens": 14603547.0,
+      "reward": 1.1125000715255737,
+      "reward_std": 0.08847897499799728,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11250000447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 912.0,
+      "completions/max_terminated_length": 912.0,
+      "completions/mean_length": 390.2321472167969,
+      "completions/mean_terminated_length": 390.2321472167969,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "epoch": 0.13102914624709827,
+      "grad_norm": 0.6540249586105347,
+      "kl": 0.029022216796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0149,
+      "num_tokens": 14708335.0,
+      "reward": 1.140625,
+      "reward_std": 0.09773431718349457,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.140625,
+      "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 810.0,
+      "completions/max_terminated_length": 810.0,
+      "completions/mean_length": 442.0000305175781,
+      "completions/mean_terminated_length": 442.0000305175781,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "epoch": 0.13206087180809906,
+      "grad_norm": 0.5070663094520569,
+      "kl": 0.0247802734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0073,
+      "num_tokens": 14830408.0,
+      "reward": 1.0656250715255737,
+      "reward_std": 0.06330667436122894,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06562499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.13722331821918488,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1415.0,
+      "completions/max_terminated_length": 1415.0,
+      "completions/mean_length": 449.7232360839844,
+      "completions/mean_terminated_length": 449.7232360839844,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 0.1330925973690998,
+      "grad_norm": 0.5839290022850037,
+      "kl": 0.02423095703125,
+      "learning_rate": 1e-06,
+      "loss": 0.003,
+      "num_tokens": 14948593.0,
+      "reward": 1.1218750476837158,
+      "reward_std": 0.07438036054372787,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 716.0,
+      "completions/max_terminated_length": 716.0,
+      "completions/mean_length": 395.7589416503906,
+      "completions/mean_terminated_length": 395.7589416503906,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 0.1341243229301006,
+      "grad_norm": 0.6906790137290955,
+      "kl": 0.027679443359375,
+      "learning_rate": 1e-06,
+      "loss": 0.005,
+      "num_tokens": 15069365.0,
+      "reward": 1.084375023841858,
+      "reward_std": 0.09088490903377533,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08437500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 797.0,
+      "completions/max_terminated_length": 797.0,
+      "completions/mean_length": 379.3750305175781,
+      "completions/mean_terminated_length": 379.3750305175781,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 0.13515604849110135,
+      "grad_norm": 0.8479922413825989,
+      "kl": 0.03192138671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0054,
+      "num_tokens": 15179328.0,
+      "reward": 1.1062501668930054,
+      "reward_std": 0.10843963176012039,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 804.0,
+      "completions/max_terminated_length": 804.0,
+      "completions/mean_length": 446.8660888671875,
+      "completions/mean_terminated_length": 446.8660888671875,
+      "completions/min_length": 245.0,
+      "completions/min_terminated_length": 245.0,
+      "epoch": 0.13618777405210214,
+      "grad_norm": 0.49666714668273926,
+      "kl": 0.02545166015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0134,
+      "num_tokens": 15295282.0,
+      "reward": 1.078125,
+      "reward_std": 0.06509362161159515,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.078125,
+      "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 888.0,
+      "completions/max_terminated_length": 888.0,
+      "completions/mean_length": 397.20538330078125,
+      "completions/mean_terminated_length": 397.20538330078125,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 0.13721949961310292,
+      "grad_norm": 0.66218501329422,
+      "kl": 0.02874755859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0243,
+      "num_tokens": 15402398.0,
+      "reward": 1.100000023841858,
+      "reward_std": 0.07539913058280945,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10000000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 761.0,
+      "completions/max_terminated_length": 761.0,
+      "completions/mean_length": 404.3214416503906,
+      "completions/mean_terminated_length": 404.3214416503906,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 0.13825122517410368,
+      "grad_norm": 0.6109785437583923,
+      "kl": 0.025726318359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0133,
+      "num_tokens": 15510143.0,
+      "reward": 1.0718750953674316,
+      "reward_std": 0.060281746089458466,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07187499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 714.0,
+      "completions/max_terminated_length": 714.0,
+      "completions/mean_length": 374.3125305175781,
+      "completions/mean_terminated_length": 374.3125305175781,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 0.13928295073510447,
+      "grad_norm": 0.5623469948768616,
+      "kl": 0.0333251953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0078,
+      "num_tokens": 15614842.0,
+      "reward": 1.1000001430511475,
+      "reward_std": 0.05682564526796341,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09999998658895493,
+      "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 848.0,
+      "completions/max_terminated_length": 848.0,
+      "completions/mean_length": 453.794677734375,
+      "completions/mean_terminated_length": 453.794677734375,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 0.14031467629610522,
+      "grad_norm": 0.532106876373291,
+      "kl": 0.026397705078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0023,
+      "num_tokens": 15738971.0,
+      "reward": 1.0625,
+      "reward_std": 0.06709976494312286,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0625,
+      "rewards/curriculum_aware_reward_fn/std": 0.13465002179145813,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 956.0,
+      "completions/max_terminated_length": 956.0,
+      "completions/mean_length": 369.7857360839844,
+      "completions/mean_terminated_length": 369.7857360839844,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 0.141346401857106,
+      "grad_norm": 0.6140812039375305,
+      "kl": 0.030914306640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0049,
+      "num_tokens": 15854524.0,
+      "reward": 1.1031250953674316,
+      "reward_std": 0.05096359923481941,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 785.0,
+      "completions/max_terminated_length": 785.0,
+      "completions/mean_length": 332.6875,
+      "completions/mean_terminated_length": 332.6875,
+      "completions/min_length": 140.0,
+      "completions/min_terminated_length": 140.0,
+      "epoch": 0.1423781274181068,
+      "grad_norm": 0.7683741450309753,
+      "kl": 0.0350341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0076,
+      "num_tokens": 15957291.0,
+      "reward": 1.1218750476837158,
+      "reward_std": 0.08706042170524597,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 629.0,
+      "completions/max_terminated_length": 629.0,
+      "completions/mean_length": 328.9285888671875,
+      "completions/mean_terminated_length": 328.9285888671875,
+      "completions/min_length": 113.0,
+      "completions/min_terminated_length": 113.0,
+      "epoch": 0.14340985297910755,
+      "grad_norm": 0.6324326395988464,
+      "kl": 0.03460693359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0228,
+      "num_tokens": 16065335.0,
+      "reward": 1.1500002145767212,
+      "reward_std": 0.05543847754597664,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 662.0,
+      "completions/max_terminated_length": 662.0,
+      "completions/mean_length": 360.6071472167969,
+      "completions/mean_terminated_length": 360.6071472167969,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 0.14444157854010833,
+      "grad_norm": 0.6328318119049072,
+      "kl": 0.034393310546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0093,
+      "num_tokens": 16179272.0,
+      "reward": 1.109375,
+      "reward_std": 0.0612691268324852,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.109375,
+      "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 598.0,
+      "completions/max_terminated_length": 598.0,
+      "completions/mean_length": 337.7410888671875,
+      "completions/mean_terminated_length": 337.7410888671875,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 0.14547330410110912,
+      "grad_norm": 0.7572005391120911,
+      "kl": 0.03558349609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0211,
+      "num_tokens": 16277775.0,
+      "reward": 1.09375,
+      "reward_std": 0.08261694014072418,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1168.0,
+      "completions/max_terminated_length": 1168.0,
+      "completions/mean_length": 392.33038330078125,
+      "completions/mean_terminated_length": 392.33038330078125,
+      "completions/min_length": 124.0,
+      "completions/min_terminated_length": 124.0,
+      "epoch": 0.14650502966210988,
+      "grad_norm": 0.6479676365852356,
+      "kl": 0.03375244140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0088,
+      "num_tokens": 16392570.0,
+      "reward": 1.0785715579986572,
+      "reward_std": 0.07929752767086029,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.15223558247089386,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1152.0,
+      "completions/max_terminated_length": 1152.0,
+      "completions/mean_length": 432.8214416503906,
+      "completions/mean_terminated_length": 432.8214416503906,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 0.14753675522311066,
+      "grad_norm": 0.5537639260292053,
+      "kl": 0.028564453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0156,
+      "num_tokens": 16515001.0,
+      "reward": 1.068750023841858,
+      "reward_std": 0.07296179980039597,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 987.0,
+      "completions/max_terminated_length": 987.0,
+      "completions/mean_length": 372.6071472167969,
+      "completions/mean_terminated_length": 372.6071472167969,
+      "completions/min_length": 124.0,
+      "completions/min_terminated_length": 124.0,
+      "epoch": 0.14856848078411142,
+      "grad_norm": 0.7606719732284546,
+      "kl": 0.03546142578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0072,
+      "num_tokens": 16620524.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.08746019750833511,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1653594672679901,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1073.0,
+      "completions/max_terminated_length": 1073.0,
+      "completions/mean_length": 375.5446472167969,
+      "completions/mean_terminated_length": 375.5446472167969,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 0.1496002063451122,
+      "grad_norm": 0.6230894327163696,
+      "kl": 0.0325927734375,
+      "learning_rate": 1e-06,
+      "loss": -0.015,
+      "num_tokens": 16728742.0,
+      "reward": 1.0812500715255737,
+      "reward_std": 0.08465448766946793,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08124999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.14843402802944183,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 724.0,
+      "completions/max_terminated_length": 724.0,
+      "completions/mean_length": 376.21429443359375,
+      "completions/mean_terminated_length": 376.21429443359375,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 0.150631931906113,
+      "grad_norm": 0.6115962862968445,
+      "kl": 0.0338134765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0162,
+      "num_tokens": 16844434.0,
+      "reward": 1.0598214864730835,
+      "reward_std": 0.08146720379590988,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 759.0,
+      "completions/max_terminated_length": 759.0,
+      "completions/mean_length": 373.26788330078125,
+      "completions/mean_terminated_length": 373.26788330078125,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "epoch": 0.15166365746711374,
+      "grad_norm": 0.7758038640022278,
+      "kl": 0.03533935546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0179,
+      "num_tokens": 16948799.0,
+      "reward": 1.1625001430511475,
+      "reward_std": 0.08505426347255707,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16250000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 786.0,
+      "completions/max_terminated_length": 786.0,
+      "completions/mean_length": 368.0089416503906,
+      "completions/mean_terminated_length": 368.0089416503906,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 0.15269538302811453,
+      "grad_norm": 0.7160394191741943,
+      "kl": 0.037841796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0157,
+      "num_tokens": 17055816.0,
+      "reward": 1.0968750715255737,
+      "reward_std": 0.07817345857620239,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09687499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.15729717910289764,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 719.0,
+      "completions/max_terminated_length": 719.0,
+      "completions/mean_length": 385.4821472167969,
+      "completions/mean_terminated_length": 385.4821472167969,
+      "completions/min_length": 143.0,
+      "completions/min_terminated_length": 143.0,
+      "epoch": 0.15372710858911529,
+      "grad_norm": 0.79217529296875,
+      "kl": 0.03387451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0223,
+      "num_tokens": 17157433.0,
+      "reward": 1.1031250953674316,
+      "reward_std": 0.10705246776342392,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 925.0,
+      "completions/max_terminated_length": 925.0,
+      "completions/mean_length": 374.58929443359375,
+      "completions/mean_terminated_length": 374.58929443359375,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 0.15475883415011607,
+      "grad_norm": 0.6225085258483887,
+      "kl": 0.032196044921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0161,
+      "num_tokens": 17265978.0,
+      "reward": 1.1218751668930054,
+      "reward_std": 0.06330667436122894,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 962.0,
+      "completions/max_terminated_length": 962.0,
+      "completions/mean_length": 393.5357360839844,
+      "completions/mean_terminated_length": 393.5357360839844,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 0.15579055971111685,
+      "grad_norm": 0.6503400206565857,
+      "kl": 0.03387451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0184,
+      "num_tokens": 17374356.0,
+      "reward": 1.1343750953674316,
+      "reward_std": 0.07780507206916809,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 874.0,
+      "completions/max_terminated_length": 874.0,
+      "completions/mean_length": 392.0982360839844,
+      "completions/mean_terminated_length": 392.0982360839844,
+      "completions/min_length": 138.0,
+      "completions/min_terminated_length": 138.0,
+      "epoch": 0.1568222852721176,
+      "grad_norm": 0.7472891807556152,
+      "kl": 0.033172607421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0196,
+      "num_tokens": 17486423.0,
+      "reward": 1.140625,
+      "reward_std": 0.10119043290615082,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.140625,
+      "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 887.0,
+      "completions/max_terminated_length": 887.0,
+      "completions/mean_length": 367.26788330078125,
+      "completions/mean_terminated_length": 367.26788330078125,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 0.1578540108331184,
+      "grad_norm": 0.7780349254608154,
+      "kl": 0.0379638671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0195,
+      "num_tokens": 17586854.0,
+      "reward": 1.0968750715255737,
+      "reward_std": 0.10637068003416061,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09687500447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.15729716420173645,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1003.0,
+      "completions/max_terminated_length": 1003.0,
+      "completions/mean_length": 424.5625305175781,
+      "completions/mean_terminated_length": 424.5625305175781,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 0.15888573639411915,
+      "grad_norm": 0.5371277928352356,
+      "kl": 0.04486083984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0115,
+      "num_tokens": 17708969.0,
+      "reward": 1.1000001430511475,
+      "reward_std": 0.05886319279670715,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.15882451832294464,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 723.0,
+      "completions/max_terminated_length": 723.0,
+      "completions/mean_length": 361.1160888671875,
+      "completions/mean_terminated_length": 361.1160888671875,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 0.15991746195511994,
+      "grad_norm": 0.650229811668396,
+      "kl": 0.03973388671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0119,
+      "num_tokens": 17814178.0,
+      "reward": 1.1031250953674316,
+      "reward_std": 0.07882384210824966,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 587.0,
+      "completions/max_terminated_length": 587.0,
+      "completions/mean_length": 369.1875305175781,
+      "completions/mean_terminated_length": 369.1875305175781,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 0.16094918751612072,
+      "grad_norm": 0.7069404125213623,
+      "kl": 0.03704833984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0098,
+      "num_tokens": 17927543.0,
+      "reward": 1.1000001430511475,
+      "reward_std": 0.09532840549945831,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.15882451832294464,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 964.0,
+      "completions/max_terminated_length": 964.0,
+      "completions/mean_length": 362.58929443359375,
+      "completions/mean_terminated_length": 362.58929443359375,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 0.16198091307712148,
+      "grad_norm": 0.6486988067626953,
+      "kl": 0.04095458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0077,
+      "num_tokens": 18035395.0,
+      "reward": 1.1218751668930054,
+      "reward_std": 0.059231579303741455,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 775.0,
+      "completions/max_terminated_length": 775.0,
+      "completions/mean_length": 360.27679443359375,
+      "completions/mean_terminated_length": 360.27679443359375,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 0.16301263863812226,
+      "grad_norm": 0.7021763920783997,
+      "kl": 0.03875732421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0069,
+      "num_tokens": 18135925.0,
+      "reward": 1.0812500715255737,
+      "reward_std": 0.08400409668684006,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08125000447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 955.0,
+      "completions/max_terminated_length": 955.0,
+      "completions/mean_length": 335.83929443359375,
+      "completions/mean_terminated_length": 335.83929443359375,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 0.16404436419912302,
+      "grad_norm": 0.7946338057518005,
+      "kl": 0.04852294921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0231,
+      "num_tokens": 18237057.0,
+      "reward": 1.0843751430511475,
+      "reward_std": 0.10982678830623627,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08437500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 768.0,
+      "completions/max_terminated_length": 768.0,
+      "completions/mean_length": 364.2321472167969,
+      "completions/mean_terminated_length": 364.2321472167969,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 0.1650760897601238,
+      "grad_norm": 0.5329174995422363,
+      "kl": 0.0428466796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0073,
+      "num_tokens": 18350553.0,
+      "reward": 1.0593751668930054,
+      "reward_std": 0.053001150488853455,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05937499925494194,
+      "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 710.0,
+      "completions/max_terminated_length": 710.0,
+      "completions/mean_length": 325.3571472167969,
+      "completions/mean_terminated_length": 325.3571472167969,
+      "completions/min_length": 148.0,
+      "completions/min_terminated_length": 148.0,
+      "epoch": 0.1661078153211246,
+      "grad_norm": 0.6059619188308716,
+      "kl": 0.05255126953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0246,
+      "num_tokens": 18455292.0,
+      "reward": 1.1468751430511475,
+      "reward_std": 0.06953709572553635,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 830.0,
+      "completions/max_terminated_length": 830.0,
+      "completions/mean_length": 351.8214416503906,
+      "completions/mean_terminated_length": 351.8214416503906,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 0.16713954088212535,
+      "grad_norm": 0.787862241268158,
+      "kl": 0.0474853515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0233,
+      "num_tokens": 18557857.0,
+      "reward": 1.140625238418579,
+      "reward_std": 0.10260899364948273,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.140625,
+      "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1058.0,
+      "completions/max_terminated_length": 1058.0,
+      "completions/mean_length": 379.33929443359375,
+      "completions/mean_terminated_length": 379.33929443359375,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 0.16817126644312613,
+      "grad_norm": 0.5836665034294128,
+      "kl": 0.0445556640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0055,
+      "num_tokens": 18668040.0,
+      "reward": 1.0656250715255737,
+      "reward_std": 0.06130051985383034,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06562499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.13722331821918488,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 666.0,
+      "completions/max_terminated_length": 666.0,
+      "completions/mean_length": 305.39288330078125,
+      "completions/mean_terminated_length": 305.39288330078125,
+      "completions/min_length": 154.0,
+      "completions/min_terminated_length": 154.0,
+      "epoch": 0.1692029920041269,
+      "grad_norm": 0.588325023651123,
+      "kl": 0.0758056640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0113,
+      "num_tokens": 18773707.0,
+      "reward": 1.1531251668930054,
+      "reward_std": 0.05401992052793503,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15312500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17440778017044067,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 933.0,
+      "completions/max_terminated_length": 933.0,
+      "completions/mean_length": 349.0535888671875,
+      "completions/mean_terminated_length": 349.0535888671875,
+      "completions/min_length": 133.0,
+      "completions/min_terminated_length": 133.0,
+      "epoch": 0.17023471756512767,
+      "grad_norm": 0.5635197162628174,
+      "kl": 0.04345703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0101,
+      "num_tokens": 18882122.0,
+      "reward": 1.09375,
+      "reward_std": 0.07882384210824966,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0937499925494194,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569154918193817,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 473.0,
+      "completions/max_terminated_length": 473.0,
+      "completions/mean_length": 251.52679443359375,
+      "completions/mean_terminated_length": 251.52679443359375,
+      "completions/min_length": 151.0,
+      "completions/min_terminated_length": 151.0,
+      "epoch": 0.17126644312612846,
+      "grad_norm": 0.9392991662025452,
+      "kl": 0.05517578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0068,
+      "num_tokens": 18963171.0,
+      "reward": 1.2062500715255737,
+      "reward_std": 0.11976392567157745,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.20624998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.17296110093593597,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 765.0,
+      "completions/max_terminated_length": 765.0,
+      "completions/mean_length": 319.33929443359375,
+      "completions/mean_terminated_length": 319.33929443359375,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 0.17229816868712922,
+      "grad_norm": 0.6024711728096008,
+      "kl": 0.0548095703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0105,
+      "num_tokens": 19059935.0,
+      "reward": 1.1406251192092896,
+      "reward_std": 0.05198238044977188,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.140625,
+      "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 702.0,
+      "completions/max_terminated_length": 702.0,
+      "completions/mean_length": 327.8482360839844,
+      "completions/mean_terminated_length": 327.8482360839844,
+      "completions/min_length": 161.0,
+      "completions/min_terminated_length": 161.0,
+      "epoch": 0.17332989424813,
+      "grad_norm": 0.6369600296020508,
+      "kl": 0.0540771484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0193,
+      "num_tokens": 19154831.0,
+      "reward": 1.1062501668930054,
+      "reward_std": 0.055438488721847534,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 750.0,
+      "completions/max_terminated_length": 750.0,
+      "completions/mean_length": 339.6696472167969,
+      "completions/mean_terminated_length": 339.6696472167969,
+      "completions/min_length": 148.0,
+      "completions/min_terminated_length": 148.0,
+      "epoch": 0.17436161980913076,
+      "grad_norm": 0.6260164380073547,
+      "kl": 0.0457763671875,
+      "learning_rate": 1e-06,
+      "loss": -0.009,
+      "num_tokens": 19253165.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.06228789687156677,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1352.0,
+      "completions/max_terminated_length": 1352.0,
+      "completions/mean_length": 331.9107360839844,
+      "completions/mean_terminated_length": 331.9107360839844,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "epoch": 0.17539334537013154,
+      "grad_norm": 0.6983166933059692,
+      "kl": 0.0447998046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0232,
+      "num_tokens": 19356020.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.0981341153383255,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.17190392315387726,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1015.0,
+      "completions/max_terminated_length": 1015.0,
+      "completions/mean_length": 369.8035888671875,
+      "completions/mean_terminated_length": 369.8035888671875,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 0.17642507093113233,
+      "grad_norm": 0.3841709792613983,
+      "kl": 0.046875,
+      "learning_rate": 1e-06,
+      "loss": 0.009,
+      "num_tokens": 19471267.0,
+      "reward": 1.0593751668930054,
+      "reward_std": 0.043346013873815536,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05937499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 703.0,
+      "completions/max_terminated_length": 703.0,
+      "completions/mean_length": 329.77679443359375,
+      "completions/mean_terminated_length": 329.77679443359375,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "epoch": 0.17745679649213308,
+      "grad_norm": 0.6240721344947815,
+      "kl": 0.0479736328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0103,
+      "num_tokens": 19569081.0,
+      "reward": 1.1031250953674316,
+      "reward_std": 0.08021100610494614,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 678.0,
+      "completions/max_terminated_length": 678.0,
+      "completions/mean_length": 363.8660888671875,
+      "completions/mean_terminated_length": 363.8660888671875,
+      "completions/min_length": 133.0,
+      "completions/min_terminated_length": 133.0,
+      "epoch": 0.17848852205313387,
+      "grad_norm": 0.7446913719177246,
+      "kl": 0.04083251953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0144,
+      "num_tokens": 19669096.0,
+      "reward": 1.1500000953674316,
+      "reward_std": 0.08122977614402771,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 559.0,
+      "completions/max_terminated_length": 559.0,
+      "completions/mean_length": 320.9732360839844,
+      "completions/mean_terminated_length": 320.9732360839844,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 0.17952024761413465,
+      "grad_norm": 0.7765859365463257,
+      "kl": 0.07684326171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0013,
+      "num_tokens": 19765840.0,
+      "reward": 1.125,
+      "reward_std": 0.0671311616897583,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 717.0,
+      "completions/max_terminated_length": 717.0,
+      "completions/mean_length": 360.9285888671875,
+      "completions/mean_terminated_length": 360.9285888671875,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "epoch": 0.1805519731751354,
+      "grad_norm": 0.7132924199104309,
+      "kl": 0.04248046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0156,
+      "num_tokens": 19867556.0,
+      "reward": 1.1093751192092896,
+      "reward_std": 0.09430961310863495,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.109375,
+      "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1154.0,
+      "completions/max_terminated_length": 1154.0,
+      "completions/mean_length": 376.3750305175781,
+      "completions/mean_terminated_length": 376.3750305175781,
+      "completions/min_length": 158.0,
+      "completions/min_terminated_length": 158.0,
+      "epoch": 0.1815836987361362,
+      "grad_norm": 0.5997862815856934,
+      "kl": 0.04632568359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0045,
+      "num_tokens": 19973858.0,
+      "reward": 1.15625,
+      "reward_std": 0.08159816265106201,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15625,
+      "rewards/curriculum_aware_reward_fn/std": 0.21446822583675385,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 700.0,
+      "completions/max_terminated_length": 700.0,
+      "completions/mean_length": 366.76788330078125,
+      "completions/mean_terminated_length": 366.76788330078125,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 0.18261542429713695,
+      "grad_norm": 0.5875352025032043,
+      "kl": 0.0478515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0007,
+      "num_tokens": 20090788.0,
+      "reward": 1.0875000953674316,
+      "reward_std": 0.0781734511256218,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1195.0,
+      "completions/max_terminated_length": 1195.0,
+      "completions/mean_length": 409.6250305175781,
+      "completions/mean_terminated_length": 409.6250305175781,
+      "completions/min_length": 154.0,
+      "completions/min_terminated_length": 154.0,
+      "epoch": 0.18364714985813774,
+      "grad_norm": 0.6117048263549805,
+      "kl": 0.0433349609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0216,
+      "num_tokens": 20214415.0,
+      "reward": 1.1093751192092896,
+      "reward_std": 0.09051652252674103,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.109375,
+      "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1059.0,
+      "completions/max_terminated_length": 1059.0,
+      "completions/mean_length": 322.4375,
+      "completions/mean_terminated_length": 322.4375,
+      "completions/min_length": 142.0,
+      "completions/min_terminated_length": 142.0,
+      "epoch": 0.18467887541913852,
+      "grad_norm": 0.6856574416160583,
+      "kl": 0.05731201171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0232,
+      "num_tokens": 20318302.0,
+      "reward": 1.1160715818405151,
+      "reward_std": 0.07319922745227814,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 827.0,
+      "completions/max_terminated_length": 827.0,
+      "completions/mean_length": 375.40179443359375,
+      "completions/mean_terminated_length": 375.40179443359375,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 0.18571060098013928,
+      "grad_norm": 0.6979689598083496,
+      "kl": 0.0533447265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0019,
+      "num_tokens": 20428394.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.07919223606586456,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1192.0,
+      "completions/max_terminated_length": 1192.0,
+      "completions/mean_length": 351.1785888671875,
+      "completions/mean_terminated_length": 351.1785888671875,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 0.18674232654114006,
+      "grad_norm": 0.6818945407867432,
+      "kl": 0.0538330078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0187,
+      "num_tokens": 20538370.0,
+      "reward": 1.0875000953674316,
+      "reward_std": 0.07576751708984375,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08749999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.15223558247089386,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 671.0,
+      "completions/max_terminated_length": 671.0,
+      "completions/mean_length": 335.375,
+      "completions/mean_terminated_length": 335.375,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 0.18777405210214082,
+      "grad_norm": 0.8694007992744446,
+      "kl": 0.0506591796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0013,
+      "num_tokens": 20648872.0,
+      "reward": 1.1625001430511475,
+      "reward_std": 0.1025775894522667,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16249999403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 743.0,
+      "completions/max_terminated_length": 743.0,
+      "completions/mean_length": 358.64288330078125,
+      "completions/mean_terminated_length": 358.64288330078125,
+      "completions/min_length": 153.0,
+      "completions/min_terminated_length": 153.0,
+      "epoch": 0.1888057776631416,
+      "grad_norm": 0.5991372466087341,
+      "kl": 0.0462646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0119,
+      "num_tokens": 20752495.0,
+      "reward": 1.0875000953674316,
+      "reward_std": 0.05645725876092911,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08749999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.15223558247089386,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1019.0,
+      "completions/max_terminated_length": 1019.0,
+      "completions/mean_length": 357.65179443359375,
+      "completions/mean_terminated_length": 357.65179443359375,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 0.1898375032241424,
+      "grad_norm": 0.7000672221183777,
+      "kl": 0.04669189453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0039,
+      "num_tokens": 20853662.0,
+      "reward": 1.109375,
+      "reward_std": 0.08502288162708282,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.109375,
+      "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 768.0,
+      "completions/max_terminated_length": 768.0,
+      "completions/mean_length": 377.95538330078125,
+      "completions/mean_terminated_length": 377.95538330078125,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "epoch": 0.19086922878514315,
+      "grad_norm": 0.6148974299430847,
+      "kl": 0.0625,
+      "learning_rate": 1e-06,
+      "loss": -0.0054,
+      "num_tokens": 20965608.0,
+      "reward": 1.0906250476837158,
+      "reward_std": 0.07231142371892929,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1100.0,
+      "completions/max_terminated_length": 1100.0,
+      "completions/mean_length": 381.6339416503906,
+      "completions/mean_terminated_length": 381.6339416503906,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 0.19190095434614393,
+      "grad_norm": 0.655315637588501,
+      "kl": 0.052734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0075,
+      "num_tokens": 21079142.0,
+      "reward": 1.1187500953674316,
+      "reward_std": 0.06509362161159515,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645818948745728,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 596.0,
+      "completions/max_terminated_length": 596.0,
+      "completions/mean_length": 331.65179443359375,
+      "completions/mean_terminated_length": 331.65179443359375,
+      "completions/min_length": 148.0,
+      "completions/min_terminated_length": 148.0,
+      "epoch": 0.1929326799071447,
+      "grad_norm": 0.6411604285240173,
+      "kl": 0.051025390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0284,
+      "num_tokens": 21179616.0,
+      "reward": 1.1500000953674316,
+      "reward_std": 0.05645725876092911,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15000000596046448,
+      "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 803.0,
+      "completions/max_terminated_length": 803.0,
+      "completions/mean_length": 345.6964416503906,
+      "completions/mean_terminated_length": 345.6964416503906,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "epoch": 0.19396440546814547,
+      "grad_norm": 0.6874846816062927,
+      "kl": 0.05078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0147,
+      "num_tokens": 21284729.0,
+      "reward": 1.1343750953674316,
+      "reward_std": 0.07882384210824966,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 682.0,
+      "completions/max_terminated_length": 682.0,
+      "completions/mean_length": 346.27679443359375,
+      "completions/mean_terminated_length": 346.27679443359375,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 0.19499613102914626,
+      "grad_norm": 0.7140980362892151,
+      "kl": 0.0557861328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0166,
+      "num_tokens": 21387114.0,
+      "reward": 1.0750001668930054,
+      "reward_std": 0.08264832943677902,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07499999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.144259512424469,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 639.0,
+      "completions/max_terminated_length": 639.0,
+      "completions/mean_length": 317.1964416503906,
+      "completions/mean_terminated_length": 317.1964416503906,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "epoch": 0.19602785659014701,
+      "grad_norm": 0.760844886302948,
+      "kl": 0.0556640625,
+      "learning_rate": 1e-06,
+      "loss": 0.016,
+      "num_tokens": 21492696.0,
+      "reward": 1.1281250715255737,
+      "reward_std": 0.08021100610494614,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 766.0,
+      "completions/max_terminated_length": 766.0,
+      "completions/mean_length": 363.3839416503906,
+      "completions/mean_terminated_length": 363.3839416503906,
+      "completions/min_length": 166.0,
+      "completions/min_terminated_length": 166.0,
+      "epoch": 0.1970595821511478,
+      "grad_norm": 0.8610868453979492,
+      "kl": 0.05108642578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0172,
+      "num_tokens": 21592888.0,
+      "reward": 1.125,
+      "reward_std": 0.1225382462143898,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1249999925494194,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 707.0,
+      "completions/max_terminated_length": 707.0,
+      "completions/mean_length": 325.26788330078125,
+      "completions/mean_terminated_length": 325.26788330078125,
+      "completions/min_length": 148.0,
+      "completions/min_terminated_length": 148.0,
+      "epoch": 0.19809130771214856,
+      "grad_norm": 0.6842600107192993,
+      "kl": 0.05517578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0087,
+      "num_tokens": 21690848.0,
+      "reward": 1.15625,
+      "reward_std": 0.07231142371892929,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15625,
+      "rewards/curriculum_aware_reward_fn/std": 0.174774631857872,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 710.0,
+      "completions/max_terminated_length": 710.0,
+      "completions/mean_length": 322.7232360839844,
+      "completions/mean_terminated_length": 322.7232360839844,
+      "completions/min_length": 122.0,
+      "completions/min_terminated_length": 122.0,
+      "epoch": 0.19912303327314934,
+      "grad_norm": 0.7429232597351074,
+      "kl": 0.05718994140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0072,
+      "num_tokens": 21797154.0,
+      "reward": 1.1281250715255737,
+      "reward_std": 0.08746020495891571,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 856.0,
+      "completions/max_terminated_length": 856.0,
+      "completions/mean_length": 346.83038330078125,
+      "completions/mean_terminated_length": 346.83038330078125,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 0.20015475883415013,
+      "grad_norm": 0.8076887130737305,
+      "kl": 0.06341552734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0117,
+      "num_tokens": 21907826.0,
+      "reward": 1.1125000715255737,
+      "reward_std": 0.09535979479551315,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11250000447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1751.0,
+      "completions/max_terminated_length": 1751.0,
+      "completions/mean_length": 386.58038330078125,
+      "completions/mean_terminated_length": 386.58038330078125,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "epoch": 0.20118648439515088,
+      "grad_norm": 0.47773370146751404,
+      "kl": 0.04888916015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0154,
+      "num_tokens": 22020528.0,
+      "reward": 1.1125000715255737,
+      "reward_std": 0.05198238044977188,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11250000447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.20593667030334473,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 661.0,
+      "completions/max_terminated_length": 661.0,
+      "completions/mean_length": 337.0535888671875,
+      "completions/mean_terminated_length": 337.0535888671875,
+      "completions/min_length": 136.0,
+      "completions/min_terminated_length": 136.0,
+      "epoch": 0.20221820995615167,
+      "grad_norm": 0.6952309608459473,
+      "kl": 0.0533447265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0184,
+      "num_tokens": 22116819.0,
+      "reward": 1.1000001430511475,
+      "reward_std": 0.0812297835946083,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10000000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 950.0,
+      "completions/max_terminated_length": 950.0,
+      "completions/mean_length": 383.46429443359375,
+      "completions/mean_terminated_length": 383.46429443359375,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 0.20324993551715242,
+      "grad_norm": 0.7165126204490662,
+      "kl": 0.05242919921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0237,
+      "num_tokens": 22221465.0,
+      "reward": 1.09375,
+      "reward_std": 0.09430960565805435,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569157898426056,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 612.0,
+      "completions/max_terminated_length": 612.0,
+      "completions/mean_length": 340.7589416503906,
+      "completions/mean_terminated_length": 340.7589416503906,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "epoch": 0.2042816610781532,
+      "grad_norm": 0.5596606135368347,
+      "kl": 0.05694580078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0137,
+      "num_tokens": 22319634.0,
+      "reward": 1.1187500953674316,
+      "reward_std": 0.06025035306811333,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 549.0,
+      "completions/max_terminated_length": 549.0,
+      "completions/mean_length": 318.96429443359375,
+      "completions/mean_terminated_length": 318.96429443359375,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "epoch": 0.205313386639154,
+      "grad_norm": 0.5254331231117249,
+      "kl": 0.063720703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0126,
+      "num_tokens": 22417224.0,
+      "reward": 1.0968750715255737,
+      "reward_std": 0.049576446413993835,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09687500447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.15729716420173645,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 864.0,
+      "completions/max_terminated_length": 864.0,
+      "completions/mean_length": 358.9196472167969,
+      "completions/mean_terminated_length": 358.9196472167969,
+      "completions/min_length": 166.0,
+      "completions/min_terminated_length": 166.0,
+      "epoch": 0.20634511220015475,
+      "grad_norm": 0.552406907081604,
+      "kl": 0.05364990234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0127,
+      "num_tokens": 22534311.0,
+      "reward": 1.0937501192092896,
+      "reward_std": 0.05198238044977188,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569157898426056,
+      "step": 200
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 726.0,
+      "completions/max_terminated_length": 726.0,
+      "completions/mean_length": 318.2321472167969,
+      "completions/mean_terminated_length": 318.2321472167969,
+      "completions/min_length": 132.0,
+      "completions/min_terminated_length": 132.0,
+      "epoch": 0.20737683776115554,
+      "grad_norm": 0.6902104020118713,
+      "kl": 0.06365966796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0205,
+      "num_tokens": 22636453.0,
+      "reward": 1.09375,
+      "reward_std": 0.07536774128675461,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 201
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 595.0,
+      "completions/max_terminated_length": 595.0,
+      "completions/mean_length": 358.1339416503906,
+      "completions/mean_terminated_length": 358.1339416503906,
+      "completions/min_length": 147.0,
+      "completions/min_terminated_length": 147.0,
+      "epoch": 0.2084085633221563,
+      "grad_norm": 0.45866531133651733,
+      "kl": 0.054931640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0043,
+      "num_tokens": 22744485.0,
+      "reward": 1.0500000715255737,
+      "reward_std": 0.04130847007036209,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05000000074505806,
+      "rewards/curriculum_aware_reward_fn/std": 0.12302493304014206,
+      "step": 202
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 807.0,
+      "completions/max_terminated_length": 807.0,
+      "completions/mean_length": 307.2589416503906,
+      "completions/mean_terminated_length": 307.2589416503906,
+      "completions/min_length": 105.0,
+      "completions/min_terminated_length": 105.0,
+      "epoch": 0.20944028888315708,
+      "grad_norm": 0.5563469529151917,
+      "kl": 0.069580078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0076,
+      "num_tokens": 22838671.0,
+      "reward": 1.0781251192092896,
+      "reward_std": 0.028228627517819405,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.078125,
+      "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437,
+      "step": 203
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 599.0,
+      "completions/max_terminated_length": 599.0,
+      "completions/mean_length": 305.9107360839844,
+      "completions/mean_terminated_length": 305.9107360839844,
+      "completions/min_length": 102.0,
+      "completions/min_terminated_length": 102.0,
+      "epoch": 0.21047201444415786,
+      "grad_norm": 0.9100491404533386,
+      "kl": 0.068359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0135,
+      "num_tokens": 22942639.0,
+      "reward": 1.1375001668930054,
+      "reward_std": 0.10803984105587006,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13749998807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.17170317471027374,
+      "step": 204
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1178.0,
+      "completions/max_terminated_length": 1178.0,
+      "completions/mean_length": 385.33038330078125,
+      "completions/mean_terminated_length": 385.33038330078125,
+      "completions/min_length": 131.0,
+      "completions/min_terminated_length": 131.0,
+      "epoch": 0.21150374000515862,
+      "grad_norm": 0.7055003046989441,
+      "kl": 0.05218505859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0161,
+      "num_tokens": 23049748.0,
+      "reward": 1.1160715818405151,
+      "reward_std": 0.10426744073629379,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845881938934326,
+      "step": 205
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1795.0,
+      "completions/max_terminated_length": 1795.0,
+      "completions/mean_length": 399.8214416503906,
+      "completions/mean_terminated_length": 399.8214416503906,
+      "completions/min_length": 135.0,
+      "completions/min_terminated_length": 135.0,
+      "epoch": 0.2125354655661594,
+      "grad_norm": 0.6340644955635071,
+      "kl": 0.0445556640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0182,
+      "num_tokens": 23167216.0,
+      "reward": 1.0718750953674316,
+      "reward_std": 0.07333020120859146,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07187499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859,
+      "step": 206
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1077.0,
+      "completions/max_terminated_length": 1077.0,
+      "completions/mean_length": 404.6607360839844,
+      "completions/mean_terminated_length": 404.6607360839844,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 0.2135671911271602,
+      "grad_norm": 0.48796433210372925,
+      "kl": 0.052734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0068,
+      "num_tokens": 23297098.0,
+      "reward": 1.0437500476837158,
+      "reward_std": 0.05198238044977188,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.04374999925494194,
+      "rewards/curriculum_aware_reward_fn/std": 0.11627185344696045,
+      "step": 207
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 723.0,
+      "completions/max_terminated_length": 723.0,
+      "completions/mean_length": 345.52679443359375,
+      "completions/mean_terminated_length": 345.52679443359375,
+      "completions/min_length": 121.0,
+      "completions/min_terminated_length": 121.0,
+      "epoch": 0.21459891668816095,
+      "grad_norm": 0.6831227540969849,
+      "kl": 0.05828857421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0164,
+      "num_tokens": 23405650.0,
+      "reward": 1.1031250953674316,
+      "reward_std": 0.06228789687156677,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10312499850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063,
+      "step": 208
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 556.0,
+      "completions/max_terminated_length": 556.0,
+      "completions/mean_length": 330.5982360839844,
+      "completions/mean_terminated_length": 330.5982360839844,
+      "completions/min_length": 88.0,
+      "completions/min_terminated_length": 88.0,
+      "epoch": 0.21563064224916173,
+      "grad_norm": 0.6812708377838135,
+      "kl": 0.0550537109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0184,
+      "num_tokens": 23508113.0,
+      "reward": 1.1218751668930054,
+      "reward_std": 0.06130051985383034,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 209
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 830.0,
+      "completions/max_terminated_length": 830.0,
+      "completions/mean_length": 375.83038330078125,
+      "completions/mean_terminated_length": 375.83038330078125,
+      "completions/min_length": 142.0,
+      "completions/min_terminated_length": 142.0,
+      "epoch": 0.2166623678101625,
+      "grad_norm": 0.6429232358932495,
+      "kl": 0.04766845703125,
+      "learning_rate": 1e-06,
+      "loss": 0.019,
+      "num_tokens": 23622033.0,
+      "reward": 1.1062500476837158,
+      "reward_std": 0.0812297835946083,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274,
+      "step": 210
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 593.0,
+      "completions/max_terminated_length": 593.0,
+      "completions/mean_length": 304.2410888671875,
+      "completions/mean_terminated_length": 304.2410888671875,
+      "completions/min_length": 118.0,
+      "completions/min_terminated_length": 118.0,
+      "epoch": 0.21769409337116327,
+      "grad_norm": 0.8792291283607483,
+      "kl": 0.06829833984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0112,
+      "num_tokens": 23715664.0,
+      "reward": 1.1593750715255737,
+      "reward_std": 0.10159020870923996,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15937499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.1750844269990921,
+      "step": 211
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 723.0,
+      "completions/max_terminated_length": 723.0,
+      "completions/mean_length": 362.6696472167969,
+      "completions/mean_terminated_length": 362.6696472167969,
+      "completions/min_length": 122.0,
+      "completions/min_terminated_length": 122.0,
+      "epoch": 0.21872581893216406,
+      "grad_norm": 0.7209181189537048,
+      "kl": 0.053955078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0067,
+      "num_tokens": 23823280.0,
+      "reward": 1.0906251668930054,
+      "reward_std": 0.0860416442155838,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364,
+      "step": 212
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 863.0,
+      "completions/max_terminated_length": 863.0,
+      "completions/mean_length": 367.6071472167969,
+      "completions/mean_terminated_length": 367.6071472167969,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 0.2197575444931648,
+      "grad_norm": 0.4966450035572052,
+      "kl": 0.0498046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0063,
+      "num_tokens": 23930238.0,
+      "reward": 1.0562500953674316,
+      "reward_std": 0.04130847379565239,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05624999478459358,
+      "rewards/curriculum_aware_reward_fn/std": 0.12912124395370483,
+      "step": 213
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 830.0,
+      "completions/max_terminated_length": 830.0,
+      "completions/mean_length": 351.3571472167969,
+      "completions/mean_terminated_length": 351.3571472167969,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 0.2207892700541656,
+      "grad_norm": 0.7580758929252625,
+      "kl": 0.05255126953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0136,
+      "num_tokens": 24030686.0,
+      "reward": 1.0973215103149414,
+      "reward_std": 0.11033187061548233,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274,
+      "step": 214
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 668.0,
+      "completions/max_terminated_length": 668.0,
+      "completions/mean_length": 342.2589416503906,
+      "completions/mean_terminated_length": 342.2589416503906,
+      "completions/min_length": 111.0,
+      "completions/min_terminated_length": 111.0,
+      "epoch": 0.22182099561516636,
+      "grad_norm": 0.7508425116539001,
+      "kl": 0.05023193359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0152,
+      "num_tokens": 24134280.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.09674695879220963,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1653594672679901,
+      "step": 215
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 622.0,
+      "completions/max_terminated_length": 622.0,
+      "completions/mean_length": 361.6071472167969,
+      "completions/mean_terminated_length": 361.6071472167969,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "epoch": 0.22285272117616714,
+      "grad_norm": 0.7472397685050964,
+      "kl": 0.05438232421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0032,
+      "num_tokens": 24236149.0,
+      "reward": 1.1062501668930054,
+      "reward_std": 0.11084556579589844,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274,
+      "step": 216
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 779.0,
+      "completions/max_terminated_length": 779.0,
+      "completions/mean_length": 345.1785888671875,
+      "completions/mean_terminated_length": 345.1785888671875,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "epoch": 0.22388444673716792,
+      "grad_norm": 0.7892441153526306,
+      "kl": 0.05853271484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0154,
+      "num_tokens": 24350395.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.08847897499799728,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562498658895493,
+      "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892,
+      "step": 217
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1016.0,
+      "completions/max_terminated_length": 1016.0,
+      "completions/mean_length": 383.6160888671875,
+      "completions/mean_terminated_length": 383.6160888671875,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 0.22491617229816868,
+      "grad_norm": 0.6343303918838501,
+      "kl": 0.0494384765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0105,
+      "num_tokens": 24463178.0,
+      "reward": 1.1000001430511475,
+      "reward_std": 0.08465448766946793,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09999998658895493,
+      "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345,
+      "step": 218
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 658.0,
+      "completions/max_terminated_length": 658.0,
+      "completions/mean_length": 350.3571472167969,
+      "completions/mean_terminated_length": 350.3571472167969,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 0.22594789785916947,
+      "grad_norm": 0.7159443497657776,
+      "kl": 0.0528564453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0096,
+      "num_tokens": 24574693.0,
+      "reward": 1.1093751192092896,
+      "reward_std": 0.07539913803339005,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.109375,
+      "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663,
+      "step": 219
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 736.0,
+      "completions/max_terminated_length": 736.0,
+      "completions/mean_length": 372.33038330078125,
+      "completions/mean_terminated_length": 372.33038330078125,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 0.22697962342017022,
+      "grad_norm": 0.6375694870948792,
+      "kl": 0.04815673828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0043,
+      "num_tokens": 24693037.0,
+      "reward": 1.0812500715255737,
+      "reward_std": 0.07539913803339005,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08125000447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 220
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 618.0,
+      "completions/max_terminated_length": 618.0,
+      "completions/mean_length": 330.7857360839844,
+      "completions/mean_terminated_length": 330.7857360839844,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 0.228011348981171,
+      "grad_norm": 0.6578212380409241,
+      "kl": 0.06341552734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0095,
+      "num_tokens": 24797355.0,
+      "reward": 1.078125,
+      "reward_std": 0.06469383090734482,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.078125,
+      "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437,
+      "step": 221
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 689.0,
+      "completions/max_terminated_length": 689.0,
+      "completions/mean_length": 361.0089416503906,
+      "completions/mean_terminated_length": 361.0089416503906,
+      "completions/min_length": 152.0,
+      "completions/min_terminated_length": 152.0,
+      "epoch": 0.2290430745421718,
+      "grad_norm": 0.6693058609962463,
+      "kl": 0.0523681640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0271,
+      "num_tokens": 24914732.0,
+      "reward": 1.0843751430511475,
+      "reward_std": 0.09532838314771652,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08437500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536,
+      "step": 222
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 795.0,
+      "completions/max_terminated_length": 795.0,
+      "completions/mean_length": 390.8214416503906,
+      "completions/mean_terminated_length": 390.8214416503906,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 0.23007480010317255,
+      "grad_norm": 0.6660776138305664,
+      "kl": 0.051513671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0148,
+      "num_tokens": 25030345.0,
+      "reward": 1.1000001430511475,
+      "reward_std": 0.10637068003416061,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345,
+      "step": 223
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1259.0,
+      "completions/max_terminated_length": 1259.0,
+      "completions/mean_length": 375.1875305175781,
+      "completions/mean_terminated_length": 375.1875305175781,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 0.23110652566417333,
+      "grad_norm": 0.5172949433326721,
+      "kl": 0.05389404296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0174,
+      "num_tokens": 25146055.0,
+      "reward": 1.0625,
+      "reward_std": 0.0612691231071949,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0625,
+      "rewards/curriculum_aware_reward_fn/std": 0.13465002179145813,
+      "step": 224
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 742.0,
+      "completions/max_terminated_length": 742.0,
+      "completions/mean_length": 340.08038330078125,
+      "completions/mean_terminated_length": 340.08038330078125,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 0.2321382512251741,
+      "grad_norm": 0.8066679835319519,
+      "kl": 0.0545654296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0101,
+      "num_tokens": 25256727.0,
+      "reward": 1.1531251668930054,
+      "reward_std": 0.10807124525308609,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15312500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17440778017044067,
+      "step": 225
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 605.0,
+      "completions/max_terminated_length": 605.0,
+      "completions/mean_length": 329.125,
+      "completions/mean_terminated_length": 329.125,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "epoch": 0.23316997678617488,
+      "grad_norm": 0.7467411160469055,
+      "kl": 0.05596923828125,
+      "learning_rate": 1e-06,
+      "loss": 0.024,
+      "num_tokens": 25356526.0,
+      "reward": 1.1812502145767212,
+      "reward_std": 0.07197443395853043,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.18124999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17567436397075653,
+      "step": 226
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 601.0,
+      "completions/max_terminated_length": 601.0,
+      "completions/mean_length": 351.45538330078125,
+      "completions/mean_terminated_length": 351.45538330078125,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 0.23420170234717566,
+      "grad_norm": 0.6787609457969666,
+      "kl": 0.0518798828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0156,
+      "num_tokens": 25464321.0,
+      "reward": 1.1750000715255737,
+      "reward_std": 0.09915289282798767,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.17499998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.2101050615310669,
+      "step": 227
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1135.0,
+      "completions/max_terminated_length": 1135.0,
+      "completions/mean_length": 411.58929443359375,
+      "completions/mean_terminated_length": 411.58929443359375,
+      "completions/min_length": 148.0,
+      "completions/min_terminated_length": 148.0,
+      "epoch": 0.23523342790817642,
+      "grad_norm": 0.5712829232215881,
+      "kl": 0.05523681640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0002,
+      "num_tokens": 25584213.0,
+      "reward": 1.1250001192092896,
+      "reward_std": 0.05161399021744728,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446,
+      "step": 228
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 866.0,
+      "completions/max_terminated_length": 866.0,
+      "completions/mean_length": 351.6785888671875,
+      "completions/mean_terminated_length": 351.6785888671875,
+      "completions/min_length": 135.0,
+      "completions/min_terminated_length": 135.0,
+      "epoch": 0.2362651534691772,
+      "grad_norm": 0.6961349844932556,
+      "kl": 0.05596923828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0094,
+      "num_tokens": 25699635.0,
+      "reward": 1.1160714626312256,
+      "reward_std": 0.08228814601898193,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446,
+      "step": 229
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 599.0,
+      "completions/max_terminated_length": 599.0,
+      "completions/mean_length": 344.2589416503906,
+      "completions/mean_terminated_length": 344.2589416503906,
+      "completions/min_length": 105.0,
+      "completions/min_terminated_length": 105.0,
+      "epoch": 0.23729687903017796,
+      "grad_norm": 0.869304895401001,
+      "kl": 0.0552978515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0021,
+      "num_tokens": 25798252.0,
+      "reward": 1.2000001668930054,
+      "reward_std": 0.1284002959728241,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.20000001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064,
+      "step": 230
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1022.0,
+      "completions/max_terminated_length": 1022.0,
+      "completions/mean_length": 354.4464416503906,
+      "completions/mean_terminated_length": 354.4464416503906,
+      "completions/min_length": 126.0,
+      "completions/min_terminated_length": 126.0,
+      "epoch": 0.23832860459117874,
+      "grad_norm": 0.6629135608673096,
+      "kl": 0.0545654296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0183,
+      "num_tokens": 25906970.0,
+      "reward": 1.1375001668930054,
+      "reward_std": 0.08363571017980576,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17170318961143494,
+      "step": 231
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 714.0,
+      "completions/max_terminated_length": 714.0,
+      "completions/mean_length": 365.8125305175781,
+      "completions/mean_terminated_length": 365.8125305175781,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 0.23936033015217953,
+      "grad_norm": 0.6239770650863647,
+      "kl": 0.044677734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0205,
+      "num_tokens": 26003752.0,
+      "reward": 1.1312501430511475,
+      "reward_std": 0.07641790807247162,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13124999403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.17020456492900848,
+      "step": 232
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1034.0,
+      "completions/max_terminated_length": 1034.0,
+      "completions/mean_length": 378.6160888671875,
+      "completions/mean_terminated_length": 378.6160888671875,
+      "completions/min_length": 64.0,
+      "completions/min_terminated_length": 64.0,
+      "epoch": 0.24039205571318029,
+      "grad_norm": 0.6367329955101013,
+      "kl": 0.04547119140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0092,
+      "num_tokens": 26107372.0,
+      "reward": 1.1593750715255737,
+      "reward_std": 0.05886319279670715,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15937498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.1750844269990921,
+      "step": 233
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1737.0,
+      "completions/max_terminated_length": 1737.0,
+      "completions/mean_length": 376.6607360839844,
+      "completions/mean_terminated_length": 376.6607360839844,
+      "completions/min_length": 137.0,
+      "completions/min_terminated_length": 137.0,
+      "epoch": 0.24142378127418107,
+      "grad_norm": 0.7532415390014648,
+      "kl": 0.0555419921875,
+      "learning_rate": 1e-06,
+      "loss": 0.014,
+      "num_tokens": 26216184.0,
+      "reward": 1.1375000476837158,
+      "reward_std": 0.10359635949134827,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17170318961143494,
+      "step": 234
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 894.0,
+      "completions/max_terminated_length": 894.0,
+      "completions/mean_length": 378.1339416503906,
+      "completions/mean_terminated_length": 378.1339416503906,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 0.24245550683518186,
+      "grad_norm": 0.6156569719314575,
+      "kl": 0.05511474609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0088,
+      "num_tokens": 26328087.0,
+      "reward": 1.0875000953674316,
+      "reward_std": 0.06330667436122894,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267,
+      "step": 235
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2234.0,
+      "completions/max_terminated_length": 2234.0,
+      "completions/mean_length": 457.6339416503906,
+      "completions/mean_terminated_length": 457.6339416503906,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 0.2434872323961826,
+      "grad_norm": 0.5021722316741943,
+      "kl": 0.03857421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0202,
+      "num_tokens": 26451799.0,
+      "reward": 1.078125,
+      "reward_std": 0.05886319279670715,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.078125,
+      "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437,
+      "step": 236
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 804.0,
+      "completions/max_terminated_length": 804.0,
+      "completions/mean_length": 408.1071472167969,
+      "completions/mean_terminated_length": 408.1071472167969,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 0.2445189579571834,
+      "grad_norm": 0.6339530348777771,
+      "kl": 0.04754638671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0031,
+      "num_tokens": 26573660.0,
+      "reward": 1.0593750476837158,
+      "reward_std": 0.05886319279670715,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05937499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553,
+      "step": 237
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 932.0,
+      "completions/max_terminated_length": 932.0,
+      "completions/mean_length": 387.4464416503906,
+      "completions/mean_terminated_length": 387.4464416503906,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 0.24555068351818415,
+      "grad_norm": 0.7284670472145081,
+      "kl": 0.04962158203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0168,
+      "num_tokens": 26678038.0,
+      "reward": 1.1218751668930054,
+      "reward_std": 0.09292246401309967,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 238
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1172.0,
+      "completions/max_terminated_length": 1172.0,
+      "completions/mean_length": 408.6071472167969,
+      "completions/mean_terminated_length": 408.6071472167969,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 0.24658240907918494,
+      "grad_norm": 0.5565657615661621,
+      "kl": 0.04766845703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0057,
+      "num_tokens": 26795447.0,
+      "reward": 1.1343750953674316,
+      "reward_std": 0.05543848127126694,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 239
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 859.0,
+      "completions/max_terminated_length": 859.0,
+      "completions/mean_length": 361.8839416503906,
+      "completions/mean_terminated_length": 361.8839416503906,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "epoch": 0.24761413464018572,
+      "grad_norm": 0.39315348863601685,
+      "kl": 0.04290771484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0009,
+      "num_tokens": 26908687.0,
+      "reward": 1.1218750476837158,
+      "reward_std": 0.027209853753447533,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 240
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 714.0,
+      "completions/max_terminated_length": 714.0,
+      "completions/mean_length": 371.4107360839844,
+      "completions/mean_terminated_length": 371.4107360839844,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 0.24864586020118648,
+      "grad_norm": 0.8473479151725769,
+      "kl": 0.04461669921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0011,
+      "num_tokens": 27015406.0,
+      "reward": 1.203125,
+      "reward_std": 0.13080620765686035,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.203125,
+      "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847,
+      "step": 241
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1189.0,
+      "completions/max_terminated_length": 1189.0,
+      "completions/mean_length": 395.5357360839844,
+      "completions/mean_terminated_length": 395.5357360839844,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 0.24967758576218727,
+      "grad_norm": 0.6841160655021667,
+      "kl": 0.0479736328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0323,
+      "num_tokens": 27122804.0,
+      "reward": 1.1500002145767212,
+      "reward_std": 0.08468587696552277,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064,
+      "step": 242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 875.0,
+      "completions/max_terminated_length": 875.0,
+      "completions/mean_length": 453.5982360839844,
+      "completions/mean_terminated_length": 453.5982360839844,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 0.250709311323188,
+      "grad_norm": 0.6680525541305542,
+      "kl": 0.04119873046875,
+      "learning_rate": 1e-06,
+      "loss": 0.002,
+      "num_tokens": 27241542.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.09292246401309967,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892,
+      "step": 243
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1049.0,
+      "completions/max_terminated_length": 1049.0,
+      "completions/mean_length": 442.2410888671875,
+      "completions/mean_terminated_length": 442.2410888671875,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 0.2517410368841888,
+      "grad_norm": 0.7665314078330994,
+      "kl": 0.04949951171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0443,
+      "num_tokens": 27355144.0,
+      "reward": 1.1343750953674316,
+      "reward_std": 0.11809477210044861,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 244
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 659.0,
+      "completions/max_terminated_length": 659.0,
+      "completions/mean_length": 426.3750305175781,
+      "completions/mean_terminated_length": 426.3750305175781,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 0.2527727624451896,
+      "grad_norm": 0.5100035071372986,
+      "kl": 0.0462646484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0017,
+      "num_tokens": 27479226.0,
+      "reward": 1.0718750953674316,
+      "reward_std": 0.06571260839700699,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07187499850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859,
+      "step": 245
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 758.0,
+      "completions/max_terminated_length": 758.0,
+      "completions/mean_length": 401.5535888671875,
+      "completions/mean_terminated_length": 401.5535888671875,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 0.2538044880061904,
+      "grad_norm": 0.8056138157844543,
+      "kl": 0.05877685546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0042,
+      "num_tokens": 27586836.0,
+      "reward": 1.1343750953674316,
+      "reward_std": 0.1304064393043518,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437500596046448,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 246
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 975.0,
+      "completions/max_terminated_length": 975.0,
+      "completions/mean_length": 422.3125305175781,
+      "completions/mean_terminated_length": 422.3125305175781,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 0.2548362135671911,
+      "grad_norm": 0.6086869835853577,
+      "kl": 0.05462646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.011,
+      "num_tokens": 27705724.0,
+      "reward": 1.15625,
+      "reward_std": 0.07194302976131439,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15625,
+      "rewards/curriculum_aware_reward_fn/std": 0.174774631857872,
+      "step": 247
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1083.0,
+      "completions/max_terminated_length": 1083.0,
+      "completions/mean_length": 431.4464416503906,
+      "completions/mean_terminated_length": 431.4464416503906,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 0.2558679391281919,
+      "grad_norm": 0.7313266396522522,
+      "kl": 0.05499267578125,
+      "learning_rate": 1e-06,
+      "loss": 0.001,
+      "num_tokens": 27827280.0,
+      "reward": 1.1468751430511475,
+      "reward_std": 0.11427027732133865,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847,
+      "step": 248
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 756.0,
+      "completions/max_terminated_length": 756.0,
+      "completions/mean_length": 390.96429443359375,
+      "completions/mean_terminated_length": 390.96429443359375,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 0.2568996646891927,
+      "grad_norm": 0.6673445701599121,
+      "kl": 0.05084228515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0127,
+      "num_tokens": 27931933.0,
+      "reward": 1.0687501430511475,
+      "reward_std": 0.07197443395853043,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235,
+      "step": 249
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 742.0,
+      "completions/max_terminated_length": 742.0,
+      "completions/mean_length": 406.2410888671875,
+      "completions/mean_terminated_length": 406.2410888671875,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 0.25793139025019346,
+      "grad_norm": 0.6612470746040344,
+      "kl": 0.045166015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0032,
+      "num_tokens": 28044429.0,
+      "reward": 1.1187500953674316,
+      "reward_std": 0.08468588441610336,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847,
+      "step": 250
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 905.0,
+      "completions/max_terminated_length": 905.0,
+      "completions/mean_length": 436.52679443359375,
+      "completions/mean_terminated_length": 436.52679443359375,
+      "completions/min_length": 125.0,
+      "completions/min_terminated_length": 125.0,
+      "epoch": 0.25896311581119424,
+      "grad_norm": 0.5871869921684265,
+      "kl": 0.0545654296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0034,
+      "num_tokens": 28163537.0,
+      "reward": 1.1187500953674316,
+      "reward_std": 0.07194302976131439,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847,
+      "step": 251
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 759.0,
+      "completions/max_terminated_length": 759.0,
+      "completions/mean_length": 440.107177734375,
+      "completions/mean_terminated_length": 440.107177734375,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 0.259994841372195,
+      "grad_norm": 0.5709466338157654,
+      "kl": 0.0615234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0264,
+      "num_tokens": 28277264.0,
+      "reward": 1.1250001192092896,
+      "reward_std": 0.05303254351019859,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1249999925494194,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446,
+      "step": 252
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1209.0,
+      "completions/max_terminated_length": 1209.0,
+      "completions/mean_length": 451.5625305175781,
+      "completions/mean_terminated_length": 451.5625305175781,
+      "completions/min_length": 142.0,
+      "completions/min_terminated_length": 142.0,
+      "epoch": 0.26102656693319576,
+      "grad_norm": 0.4719817638397217,
+      "kl": 0.05352783203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0145,
+      "num_tokens": 28392778.0,
+      "reward": 1.0968750715255737,
+      "reward_std": 0.057444632053375244,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09687499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.15729717910289764,
+      "step": 253
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1399.0,
+      "completions/max_terminated_length": 1399.0,
+      "completions/mean_length": 460.4732360839844,
+      "completions/mean_terminated_length": 460.4732360839844,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 0.26205829249419654,
+      "grad_norm": 0.6896635293960571,
+      "kl": 0.0528564453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0125,
+      "num_tokens": 28513105.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.1118643507361412,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892,
+      "step": 254
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 929.0,
+      "completions/max_terminated_length": 929.0,
+      "completions/mean_length": 406.7589416503906,
+      "completions/mean_terminated_length": 406.7589416503906,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 0.2630900180551973,
+      "grad_norm": 0.684826135635376,
+      "kl": 0.0616455078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0134,
+      "num_tokens": 28623223.0,
+      "reward": 1.0875000953674316,
+      "reward_std": 0.09332224726676941,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08749999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267,
+      "step": 255
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1188.0,
+      "completions/max_terminated_length": 1188.0,
+      "completions/mean_length": 529.5982666015625,
+      "completions/mean_terminated_length": 529.5982666015625,
+      "completions/min_length": 292.0,
+      "completions/min_terminated_length": 292.0,
+      "epoch": 0.2641217436161981,
+      "grad_norm": 0.4367036521434784,
+      "kl": 0.047607421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0358,
+      "num_tokens": 28764165.0,
+      "reward": 1.0906250476837158,
+      "reward_std": 0.06228790059685707,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364,
+      "step": 256
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 989.0,
+      "completions/max_terminated_length": 989.0,
+      "completions/mean_length": 399.14288330078125,
+      "completions/mean_terminated_length": 399.14288330078125,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 0.26515346917719884,
+      "grad_norm": 0.704407274723053,
+      "kl": 0.062744140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0264,
+      "num_tokens": 28881347.0,
+      "reward": 1.140625,
+      "reward_std": 0.0981341153383255,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.140625,
+      "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658,
+      "step": 257
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 796.0,
+      "completions/max_terminated_length": 796.0,
+      "completions/mean_length": 508.76788330078125,
+      "completions/mean_terminated_length": 508.76788330078125,
+      "completions/min_length": 304.0,
+      "completions/min_terminated_length": 304.0,
+      "epoch": 0.2661851947381996,
+      "grad_norm": 0.4720875918865204,
+      "kl": 0.05352783203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0164,
+      "num_tokens": 29010702.0,
+      "reward": 1.0343750715255737,
+      "reward_std": 0.057844411581754684,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.03437500074505806,
+      "rewards/curriculum_aware_reward_fn/std": 0.10462959855794907,
+      "step": 258
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 846.0,
+      "completions/max_terminated_length": 846.0,
+      "completions/mean_length": 404.6339416503906,
+      "completions/mean_terminated_length": 404.6339416503906,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 0.2672169202992004,
+      "grad_norm": 0.732636570930481,
+      "kl": 0.064453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0021,
+      "num_tokens": 29114968.0,
+      "reward": 1.1625001430511475,
+      "reward_std": 0.09634716808795929,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16250000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084,
+      "step": 259
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 854.0,
+      "completions/max_terminated_length": 854.0,
+      "completions/mean_length": 388.58929443359375,
+      "completions/mean_terminated_length": 388.58929443359375,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 0.2682486458602012,
+      "grad_norm": 0.6256258487701416,
+      "kl": 0.08599853515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0007,
+      "num_tokens": 29224073.0,
+      "reward": 1.1625001430511475,
+      "reward_std": 0.0612691268324852,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16249999403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084,
+      "step": 260
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1002.0,
+      "completions/max_terminated_length": 1002.0,
+      "completions/mean_length": 407.8482360839844,
+      "completions/mean_terminated_length": 407.8482360839844,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 0.269280371421202,
+      "grad_norm": 0.6058484315872192,
+      "kl": 0.0609130859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0155,
+      "num_tokens": 29335538.0,
+      "reward": 1.1312501430511475,
+      "reward_std": 0.07055586576461792,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13124999403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.17020456492900848,
+      "step": 261
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 782.0,
+      "completions/max_terminated_length": 782.0,
+      "completions/mean_length": 411.2410888671875,
+      "completions/mean_terminated_length": 411.2410888671875,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 0.2703120969822027,
+      "grad_norm": 0.38458946347236633,
+      "kl": 0.0601806640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0052,
+      "num_tokens": 29448146.0,
+      "reward": 1.0973215103149414,
+      "reward_std": 0.045989371836185455,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274,
+      "step": 262
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1122.0,
+      "completions/max_terminated_length": 1122.0,
+      "completions/mean_length": 434.9821472167969,
+      "completions/mean_terminated_length": 434.9821472167969,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 0.2713438225432035,
+      "grad_norm": 0.6179335117340088,
+      "kl": 0.05865478515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0004,
+      "num_tokens": 29564827.0,
+      "reward": 1.1187500953674316,
+      "reward_std": 0.07536774128675461,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847,
+      "step": 263
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 835.0,
+      "completions/max_terminated_length": 835.0,
+      "completions/mean_length": 397.08929443359375,
+      "completions/mean_terminated_length": 397.08929443359375,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 0.2723755481042043,
+      "grad_norm": 0.7425304651260376,
+      "kl": 0.06341552734375,
+      "learning_rate": 1e-06,
+      "loss": 0.007,
+      "num_tokens": 29679042.0,
+      "reward": 1.1281250715255737,
+      "reward_std": 0.07678630203008652,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713,
+      "step": 264
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 770.0,
+      "completions/max_terminated_length": 770.0,
+      "completions/mean_length": 407.02679443359375,
+      "completions/mean_terminated_length": 407.02679443359375,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 0.27340727366520506,
+      "grad_norm": 0.6302496790885925,
+      "kl": 0.0601806640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0056,
+      "num_tokens": 29796414.0,
+      "reward": 1.15625,
+      "reward_std": 0.06953710317611694,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15625,
+      "rewards/curriculum_aware_reward_fn/std": 0.21446822583675385,
+      "step": 265
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 634.0,
+      "completions/max_terminated_length": 634.0,
+      "completions/mean_length": 374.89288330078125,
+      "completions/mean_terminated_length": 374.89288330078125,
+      "completions/min_length": 154.0,
+      "completions/min_terminated_length": 154.0,
+      "epoch": 0.27443899922620585,
+      "grad_norm": 0.5936657786369324,
+      "kl": 0.069091796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0119,
+      "num_tokens": 29897335.0,
+      "reward": 1.1187500953674316,
+      "reward_std": 0.06370645761489868,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645818948745728,
+      "step": 266
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 837.0,
+      "completions/mean_length": 466.9732360839844,
+      "completions/mean_terminated_length": 434.279296875,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 0.2754707247872066,
+      "grad_norm": 0.7581308484077454,
+      "kl": 0.05352783203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0481,
+      "num_tokens": 30021750.0,
+      "reward": 1.1348215341567993,
+      "reward_std": 0.13005873560905457,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.17296108603477478,
+      "step": 267
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 628.0,
+      "completions/max_terminated_length": 628.0,
+      "completions/mean_length": 355.8482360839844,
+      "completions/mean_terminated_length": 355.8482360839844,
+      "completions/min_length": 82.0,
+      "completions/min_terminated_length": 82.0,
+      "epoch": 0.27650245034820736,
+      "grad_norm": 0.685746967792511,
+      "kl": 0.06317138671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0234,
+      "num_tokens": 30115862.0,
+      "reward": 1.15625,
+      "reward_std": 0.0733615905046463,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15625,
+      "rewards/curriculum_aware_reward_fn/std": 0.174774631857872,
+      "step": 268
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1830.0,
+      "completions/max_terminated_length": 1830.0,
+      "completions/mean_length": 414.9285888671875,
+      "completions/mean_terminated_length": 414.9285888671875,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 0.27753417590920815,
+      "grad_norm": 0.7315962910652161,
+      "kl": 0.0662841796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0023,
+      "num_tokens": 30218670.0,
+      "reward": 1.09375,
+      "reward_std": 0.1036277636885643,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 269
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1188.0,
+      "completions/max_terminated_length": 1188.0,
+      "completions/mean_length": 490.294677734375,
+      "completions/mean_terminated_length": 490.294677734375,
+      "completions/min_length": 254.0,
+      "completions/min_terminated_length": 254.0,
+      "epoch": 0.27856590147020893,
+      "grad_norm": 0.47885870933532715,
+      "kl": 0.048583984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0103,
+      "num_tokens": 30342190.0,
+      "reward": 1.109375238418579,
+      "reward_std": 0.06330667436122894,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.109375,
+      "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663,
+      "step": 270
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 885.0,
+      "completions/max_terminated_length": 885.0,
+      "completions/mean_length": 363.15179443359375,
+      "completions/mean_terminated_length": 363.15179443359375,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "epoch": 0.2795976270312097,
+      "grad_norm": 0.4935992360115051,
+      "kl": 0.05731201171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0129,
+      "num_tokens": 30449279.0,
+      "reward": 1.2312501668930054,
+      "reward_std": 0.06228789687156677,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.23125000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847,
+      "step": 271
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1352.0,
+      "completions/max_terminated_length": 1352.0,
+      "completions/mean_length": 445.70538330078125,
+      "completions/mean_terminated_length": 445.70538330078125,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 0.28062935259221045,
+      "grad_norm": 0.6727720499038696,
+      "kl": 0.0589599609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0017,
+      "num_tokens": 30572117.0,
+      "reward": 1.1187500953674316,
+      "reward_std": 0.07678630203008652,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847,
+      "step": 272
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 940.0,
+      "completions/max_terminated_length": 940.0,
+      "completions/mean_length": 416.15179443359375,
+      "completions/mean_terminated_length": 416.15179443359375,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 0.28166107815321123,
+      "grad_norm": 0.5469335317611694,
+      "kl": 0.05621337890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0053,
+      "num_tokens": 30683511.0,
+      "reward": 1.1062500476837158,
+      "reward_std": 0.05401992052793503,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274,
+      "step": 273
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 757.0,
+      "completions/max_terminated_length": 757.0,
+      "completions/mean_length": 409.9910888671875,
+      "completions/mean_terminated_length": 409.9910888671875,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 0.282692803714212,
+      "grad_norm": 0.5514264702796936,
+      "kl": 0.05218505859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0133,
+      "num_tokens": 30792969.0,
+      "reward": 1.0906251668930054,
+      "reward_std": 0.062287889420986176,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364,
+      "step": 274
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1041.0,
+      "completions/max_terminated_length": 1041.0,
+      "completions/mean_length": 412.8839416503906,
+      "completions/mean_terminated_length": 412.8839416503906,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 0.2837245292752128,
+      "grad_norm": 0.8001331090927124,
+      "kl": 0.063720703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0049,
+      "num_tokens": 30902044.0,
+      "reward": 1.1531251668930054,
+      "reward_std": 0.08709181845188141,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15312500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17440778017044067,
+      "step": 275
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1818.0,
+      "completions/max_terminated_length": 1818.0,
+      "completions/mean_length": 506.9107360839844,
+      "completions/mean_terminated_length": 506.9107360839844,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 0.2847562548362136,
+      "grad_norm": 0.4141788184642792,
+      "kl": 0.053466796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0102,
+      "num_tokens": 31025654.0,
+      "reward": 1.0781251192092896,
+      "reward_std": 0.053001150488853455,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.078125,
+      "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437,
+      "step": 276
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1385.0,
+      "completions/max_terminated_length": 1385.0,
+      "completions/mean_length": 471.39288330078125,
+      "completions/mean_terminated_length": 471.39288330078125,
+      "completions/min_length": 254.0,
+      "completions/min_terminated_length": 254.0,
+      "epoch": 0.2857879803972143,
+      "grad_norm": 0.43438559770584106,
+      "kl": 0.0543212890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0027,
+      "num_tokens": 31145678.0,
+      "reward": 1.1031250953674316,
+      "reward_std": 0.05059521645307541,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063,
+      "step": 277
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 939.0,
+      "completions/max_terminated_length": 939.0,
+      "completions/mean_length": 490.40179443359375,
+      "completions/mean_terminated_length": 490.40179443359375,
+      "completions/min_length": 258.0,
+      "completions/min_terminated_length": 258.0,
+      "epoch": 0.2868197059582151,
+      "grad_norm": 0.7618850469589233,
+      "kl": 0.04229736328125,
+      "learning_rate": 1e-06,
+      "loss": 0.01,
+      "num_tokens": 31270218.0,
+      "reward": 1.1343750953674316,
+      "reward_std": 0.12738150358200073,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 278
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1039.0,
+      "completions/max_terminated_length": 1039.0,
+      "completions/mean_length": 447.732177734375,
+      "completions/mean_terminated_length": 447.732177734375,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 0.2878514315192159,
+      "grad_norm": 0.6121480464935303,
+      "kl": 0.051025390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0003,
+      "num_tokens": 31381159.0,
+      "reward": 1.0812500715255737,
+      "reward_std": 0.07539913803339005,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08125000447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064,
+      "step": 279
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 904.0,
+      "completions/max_terminated_length": 904.0,
+      "completions/mean_length": 415.1160888671875,
+      "completions/mean_terminated_length": 415.1160888671875,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 0.28888315708021667,
+      "grad_norm": 0.7515974044799805,
+      "kl": 0.0531005859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0139,
+      "num_tokens": 31485633.0,
+      "reward": 1.146875023841858,
+      "reward_std": 0.10880802571773529,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14687499403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847,
+      "step": 280
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1068.0,
+      "completions/max_terminated_length": 1068.0,
+      "completions/mean_length": 448.65179443359375,
+      "completions/mean_terminated_length": 448.65179443359375,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 0.28991488264121745,
+      "grad_norm": 0.7135260105133057,
+      "kl": 0.05035400390625,
+      "learning_rate": 1e-06,
+      "loss": 0.008,
+      "num_tokens": 31609536.0,
+      "reward": 1.1468751430511475,
+      "reward_std": 0.087091825902462,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847,
+      "step": 281
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 971.0,
+      "completions/max_terminated_length": 971.0,
+      "completions/mean_length": 443.6785888671875,
+      "completions/mean_terminated_length": 443.6785888671875,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 0.29094660820221824,
+      "grad_norm": 0.6849467754364014,
+      "kl": 0.05706787109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0011,
+      "num_tokens": 31723754.0,
+      "reward": 1.146875023841858,
+      "reward_std": 0.10498353838920593,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847,
+      "step": 282
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 758.0,
+      "completions/max_terminated_length": 758.0,
+      "completions/mean_length": 428.2500305175781,
+      "completions/mean_terminated_length": 428.2500305175781,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 0.29197833376321897,
+      "grad_norm": 0.7078130841255188,
+      "kl": 0.05023193359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0027,
+      "num_tokens": 31832380.0,
+      "reward": 1.1343750953674316,
+      "reward_std": 0.10359636694192886,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437500596046448,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 283
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1161.0,
+      "completions/max_terminated_length": 1161.0,
+      "completions/mean_length": 532.8660888671875,
+      "completions/mean_terminated_length": 532.8660888671875,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 0.29301005932421975,
+      "grad_norm": 0.5811125636100769,
+      "kl": 0.04791259765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0246,
+      "num_tokens": 31966891.0,
+      "reward": 1.0906251668930054,
+      "reward_std": 0.08224855363368988,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364,
+      "step": 284
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1306.0,
+      "completions/max_terminated_length": 1306.0,
+      "completions/mean_length": 566.25,
+      "completions/mean_terminated_length": 566.25,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 0.29404178488522054,
+      "grad_norm": 0.36866819858551025,
+      "kl": 0.04547119140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0162,
+      "num_tokens": 32109145.0,
+      "reward": 1.0593751668930054,
+      "reward_std": 0.04130847379565239,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05937499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553,
+      "step": 285
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1880.0,
+      "completions/max_terminated_length": 1880.0,
+      "completions/mean_length": 555.8125,
+      "completions/mean_terminated_length": 555.8125,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 0.2950735104462213,
+      "grad_norm": 0.3248611092567444,
+      "kl": 0.0404052734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0094,
+      "num_tokens": 32243839.0,
+      "reward": 1.1125000715255737,
+      "reward_std": 0.024803919717669487,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11250000447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437,
+      "step": 286
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1022.0,
+      "completions/max_terminated_length": 1022.0,
+      "completions/mean_length": 536.2857666015625,
+      "completions/mean_terminated_length": 536.2857666015625,
+      "completions/min_length": 291.0,
+      "completions/min_terminated_length": 291.0,
+      "epoch": 0.2961052360072221,
+      "grad_norm": 0.49759235978126526,
+      "kl": 0.04815673828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0115,
+      "num_tokens": 32372496.0,
+      "reward": 1.0812501907348633,
+      "reward_std": 0.06469383835792542,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08124999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064,
+      "step": 287
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1278.0,
+      "completions/max_terminated_length": 1278.0,
+      "completions/mean_length": 472.83038330078125,
+      "completions/mean_terminated_length": 472.83038330078125,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 0.29713696156822283,
+      "grad_norm": 0.5727349519729614,
+      "kl": 0.0496826171875,
+      "learning_rate": 1e-06,
+      "loss": 0.002,
+      "num_tokens": 32486038.0,
+      "reward": 1.1598215103149414,
+      "reward_std": 0.11348892003297806,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16875000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17567436397075653,
+      "step": 288
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1318.0,
+      "completions/max_terminated_length": 1318.0,
+      "completions/mean_length": 533.2232666015625,
+      "completions/mean_terminated_length": 533.2232666015625,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.2981686871292236,
+      "grad_norm": 0.5070824027061462,
+      "kl": 0.04302978515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0005,
+      "num_tokens": 32608311.0,
+      "reward": 1.1187500953674316,
+      "reward_std": 0.07197443395853043,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.20774692296981812,
+      "step": 289
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1372.0,
+      "completions/max_terminated_length": 1372.0,
+      "completions/mean_length": 558.1785888671875,
+      "completions/mean_terminated_length": 558.1785888671875,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "epoch": 0.2992004126902244,
+      "grad_norm": 0.5842003226280212,
+      "kl": 0.04473876953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0222,
+      "num_tokens": 32738958.0,
+      "reward": 1.1062500476837158,
+      "reward_std": 0.09430962055921555,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274,
+      "step": 290
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1037.0,
+      "completions/max_terminated_length": 1037.0,
+      "completions/mean_length": 497.6607360839844,
+      "completions/mean_terminated_length": 497.6607360839844,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 0.3002321382512252,
+      "grad_norm": 0.46451354026794434,
+      "kl": 0.048583984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0017,
+      "num_tokens": 32860475.0,
+      "reward": 1.1812500953674316,
+      "reward_std": 0.07882384210824966,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.18125000596046448,
+      "rewards/curriculum_aware_reward_fn/std": 0.17567436397075653,
+      "step": 291
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1110.0,
+      "completions/max_terminated_length": 1110.0,
+      "completions/mean_length": 529.5803833007812,
+      "completions/mean_terminated_length": 529.5803833007812,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "epoch": 0.301263863812226,
+      "grad_norm": 0.5378194451332092,
+      "kl": 0.0421142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0276,
+      "num_tokens": 32988577.0,
+      "reward": 1.1218751668930054,
+      "reward_std": 0.0919036790728569,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 292
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1236.0,
+      "completions/max_terminated_length": 1236.0,
+      "completions/mean_length": 552.7678833007812,
+      "completions/mean_terminated_length": 552.7678833007812,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "epoch": 0.3022955893732267,
+      "grad_norm": 0.562874436378479,
+      "kl": 0.0477294921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0104,
+      "num_tokens": 33121754.0,
+      "reward": 1.1187500953674316,
+      "reward_std": 0.0919036939740181,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847,
+      "step": 293
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1673.0,
+      "completions/max_terminated_length": 1673.0,
+      "completions/mean_length": 558.5982666015625,
+      "completions/mean_terminated_length": 558.5982666015625,
+      "completions/min_length": 285.0,
+      "completions/min_terminated_length": 285.0,
+      "epoch": 0.3033273149342275,
+      "grad_norm": 0.5422093868255615,
+      "kl": 0.043212890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0103,
+      "num_tokens": 33258496.0,
+      "reward": 1.1406251192092896,
+      "reward_std": 0.09532838314771652,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.140625,
+      "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658,
+      "step": 294
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1394.0,
+      "completions/max_terminated_length": 1394.0,
+      "completions/mean_length": 595.0535888671875,
+      "completions/mean_terminated_length": 595.0535888671875,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "epoch": 0.3043590404952283,
+      "grad_norm": 0.3412728011608124,
+      "kl": 0.0443115234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0208,
+      "num_tokens": 33401547.0,
+      "reward": 1.0281250476837158,
+      "reward_std": 0.04371440038084984,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.02812499925494194,
+      "rewards/curriculum_aware_reward_fn/std": 0.09557347744703293,
+      "step": 295
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1922.0,
+      "completions/max_terminated_length": 1922.0,
+      "completions/mean_length": 589.7142944335938,
+      "completions/mean_terminated_length": 589.7142944335938,
+      "completions/min_length": 259.0,
+      "completions/min_terminated_length": 259.0,
+      "epoch": 0.30539076605622906,
+      "grad_norm": 0.46539467573165894,
+      "kl": 0.038818359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0147,
+      "num_tokens": 33538547.0,
+      "reward": 1.1187500953674316,
+      "reward_std": 0.06611239165067673,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645818948745728,
+      "step": 296
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1245.0,
+      "completions/max_terminated_length": 1245.0,
+      "completions/mean_length": 515.857177734375,
+      "completions/mean_terminated_length": 515.857177734375,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "epoch": 0.30642249161722984,
+      "grad_norm": 0.8021560907363892,
+      "kl": 0.06085205078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0044,
+      "num_tokens": 33662025.0,
+      "reward": 1.122321605682373,
+      "reward_std": 0.11312052607536316,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13124999403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.17020456492900848,
+      "step": 297
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1223.0,
+      "completions/max_terminated_length": 1223.0,
+      "completions/mean_length": 570.482177734375,
+      "completions/mean_terminated_length": 570.482177734375,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "epoch": 0.30745421717823057,
+      "grad_norm": 0.542922854423523,
+      "kl": 0.049560546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0052,
+      "num_tokens": 33788625.0,
+      "reward": 1.115625023841858,
+      "reward_std": 0.10017166286706924,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562498658895493,
+      "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892,
+      "step": 298
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1130.0,
+      "completions/max_terminated_length": 1130.0,
+      "completions/mean_length": 528.2589721679688,
+      "completions/mean_terminated_length": 528.2589721679688,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "epoch": 0.30848594273923136,
+      "grad_norm": 0.6580304503440857,
+      "kl": 0.07220458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0168,
+      "num_tokens": 33920737.0,
+      "reward": 1.1500000953674316,
+      "reward_std": 0.08465448766946793,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064,
+      "step": 299
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2114.0,
+      "completions/max_terminated_length": 2114.0,
+      "completions/mean_length": 544.6517944335938,
+      "completions/mean_terminated_length": 544.6517944335938,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 0.30951766830023214,
+      "grad_norm": 0.6650287508964539,
+      "kl": 0.051025390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0089,
+      "num_tokens": 34046726.0,
+      "reward": 1.109375238418579,
+      "reward_std": 0.103996142745018,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.109375,
+      "rewards/curriculum_aware_reward_fn/std": 0.19388526678085327,
+      "step": 300
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1616.0,
+      "completions/max_terminated_length": 1616.0,
+      "completions/mean_length": 582.7232666015625,
+      "completions/mean_terminated_length": 582.7232666015625,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "epoch": 0.3105493938612329,
+      "grad_norm": 0.527751624584198,
+      "kl": 0.042236328125,
+      "learning_rate": 1e-06,
+      "loss": 0.013,
+      "num_tokens": 34189131.0,
+      "reward": 1.0687501430511475,
+      "reward_std": 0.06268768012523651,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235,
+      "step": 301
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1237.0,
+      "completions/max_terminated_length": 1237.0,
+      "completions/mean_length": 517.8660888671875,
+      "completions/mean_terminated_length": 517.8660888671875,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 0.3115811194222337,
+      "grad_norm": 0.6737282872200012,
+      "kl": 0.05023193359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0101,
+      "num_tokens": 34321521.0,
+      "reward": 1.0941965579986572,
+      "reward_std": 0.10627111792564392,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063,
+      "step": 302
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 956.0,
+      "completions/max_terminated_length": 956.0,
+      "completions/mean_length": 510.2857360839844,
+      "completions/mean_terminated_length": 510.2857360839844,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 0.31261284498323444,
+      "grad_norm": 0.5042452216148376,
+      "kl": 0.05157470703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0215,
+      "num_tokens": 34450024.0,
+      "reward": 1.1281250715255737,
+      "reward_std": 0.07055586576461792,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713,
+      "step": 303
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1108.0,
+      "completions/max_terminated_length": 1108.0,
+      "completions/mean_length": 447.5625305175781,
+      "completions/mean_terminated_length": 447.5625305175781,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 0.3136445705442352,
+      "grad_norm": 0.6584708094596863,
+      "kl": 0.04644775390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0113,
+      "num_tokens": 34562248.0,
+      "reward": 1.15625,
+      "reward_std": 0.08363571017980576,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15625,
+      "rewards/curriculum_aware_reward_fn/std": 0.174774631857872,
+      "step": 304
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1137.0,
+      "completions/max_terminated_length": 1137.0,
+      "completions/mean_length": 526.6964721679688,
+      "completions/mean_terminated_length": 526.6964721679688,
+      "completions/min_length": 258.0,
+      "completions/min_terminated_length": 258.0,
+      "epoch": 0.314676296105236,
+      "grad_norm": 0.5943373441696167,
+      "kl": 0.0426025390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0076,
+      "num_tokens": 34691525.0,
+      "reward": 1.125,
+      "reward_std": 0.10017166286706924,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446,
+      "step": 305
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1015.0,
+      "completions/max_terminated_length": 1015.0,
+      "completions/mean_length": 535.8660888671875,
+      "completions/mean_terminated_length": 535.8660888671875,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 0.3157080216662368,
+      "grad_norm": 0.6603030562400818,
+      "kl": 0.04278564453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0083,
+      "num_tokens": 34825035.0,
+      "reward": 1.1625001430511475,
+      "reward_std": 0.09674695134162903,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16249999403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.21492718160152435,
+      "step": 306
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 822.0,
+      "completions/max_terminated_length": 822.0,
+      "completions/mean_length": 453.96429443359375,
+      "completions/mean_terminated_length": 453.96429443359375,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 0.3167397472272376,
+      "grad_norm": 0.42629313468933105,
+      "kl": 0.04534912109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0013,
+      "num_tokens": 34949120.0,
+      "reward": 1.109375,
+      "reward_std": 0.048189278692007065,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.109375,
+      "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663,
+      "step": 307
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 911.0,
+      "completions/max_terminated_length": 911.0,
+      "completions/mean_length": 425.7232360839844,
+      "completions/mean_terminated_length": 425.7232360839844,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 0.3177714727882383,
+      "grad_norm": 0.6072869896888733,
+      "kl": 0.04669189453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0036,
+      "num_tokens": 35059094.0,
+      "reward": 1.1875,
+      "reward_std": 0.08261694759130478,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1875,
+      "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084,
+      "step": 308
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1143.0,
+      "completions/max_terminated_length": 1143.0,
+      "completions/mean_length": 514.5803833007812,
+      "completions/mean_terminated_length": 514.5803833007812,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 0.3188031983492391,
+      "grad_norm": 0.5873180627822876,
+      "kl": 0.0450439453125,
+      "learning_rate": 1e-06,
+      "loss": 0.017,
+      "num_tokens": 35190052.0,
+      "reward": 1.0906251668930054,
+      "reward_std": 0.10359636694192886,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364,
+      "step": 309
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 948.0,
+      "completions/max_terminated_length": 948.0,
+      "completions/mean_length": 467.1964416503906,
+      "completions/mean_terminated_length": 467.1964416503906,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 0.3198349239102399,
+      "grad_norm": 0.57830411195755,
+      "kl": 0.044921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0017,
+      "num_tokens": 35307163.0,
+      "reward": 1.0968750715255737,
+      "reward_std": 0.0856732651591301,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09687500447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.15729716420173645,
+      "step": 310
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 822.0,
+      "completions/max_terminated_length": 822.0,
+      "completions/mean_length": 451.2589416503906,
+      "completions/mean_terminated_length": 451.2589416503906,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 0.32086664947124066,
+      "grad_norm": 0.5586079955101013,
+      "kl": 0.0455322265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0125,
+      "num_tokens": 35425695.0,
+      "reward": 1.0875000953674316,
+      "reward_std": 0.07678630203008652,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08749999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267,
+      "step": 311
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1204.0,
+      "completions/max_terminated_length": 1204.0,
+      "completions/mean_length": 437.6875305175781,
+      "completions/mean_terminated_length": 437.6875305175781,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 0.32189837503224145,
+      "grad_norm": 0.4353128969669342,
+      "kl": 0.0452880859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0006,
+      "num_tokens": 35544163.0,
+      "reward": 1.1593750715255737,
+      "reward_std": 0.043714407831430435,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15937499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.1750844269990921,
+      "step": 312
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1153.0,
+      "completions/max_terminated_length": 1153.0,
+      "completions/mean_length": 487.2500305175781,
+      "completions/mean_terminated_length": 487.2500305175781,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 0.3229301005932422,
+      "grad_norm": 0.6254375576972961,
+      "kl": 0.0452880859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0001,
+      "num_tokens": 35669680.0,
+      "reward": 1.0875000953674316,
+      "reward_std": 0.08949775248765945,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.15223558247089386,
+      "step": 313
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1118.0,
+      "completions/max_terminated_length": 1118.0,
+      "completions/mean_length": 426.1607360839844,
+      "completions/mean_terminated_length": 426.1607360839844,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "epoch": 0.32396182615424296,
+      "grad_norm": 0.690932035446167,
+      "kl": 0.046630859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0227,
+      "num_tokens": 35782979.0,
+      "reward": 1.1531250476837158,
+      "reward_std": 0.10054004937410355,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15312500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17440778017044067,
+      "step": 314
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1101.0,
+      "completions/max_terminated_length": 1101.0,
+      "completions/mean_length": 464.46429443359375,
+      "completions/mean_terminated_length": 464.46429443359375,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 0.32499355171524374,
+      "grad_norm": 0.603801429271698,
+      "kl": 0.04901123046875,
+      "learning_rate": 1e-06,
+      "loss": 0.019,
+      "num_tokens": 35903214.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.07678630203008652,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562498658895493,
+      "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892,
+      "step": 315
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1027.0,
+      "completions/max_terminated_length": 1027.0,
+      "completions/mean_length": 461.4464416503906,
+      "completions/mean_terminated_length": 461.4464416503906,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 0.32602527727624453,
+      "grad_norm": 0.5353335738182068,
+      "kl": 0.05169677734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0105,
+      "num_tokens": 36021452.0,
+      "reward": 1.071874976158142,
+      "reward_std": 0.053001150488853455,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07187499850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859,
+      "step": 316
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 850.0,
+      "completions/max_terminated_length": 850.0,
+      "completions/mean_length": 448.4910888671875,
+      "completions/mean_terminated_length": 448.4910888671875,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 0.3270570028372453,
+      "grad_norm": 0.6988222002983093,
+      "kl": 0.0484619140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0056,
+      "num_tokens": 36139422.0,
+      "reward": 1.0812500715255737,
+      "reward_std": 0.0733615905046463,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08125000447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064,
+      "step": 317
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 952.0,
+      "completions/max_terminated_length": 952.0,
+      "completions/mean_length": 448.58929443359375,
+      "completions/mean_terminated_length": 448.58929443359375,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 0.32808872839824604,
+      "grad_norm": 0.5886378288269043,
+      "kl": 0.0513916015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0043,
+      "num_tokens": 36254652.0,
+      "reward": 1.1218751668930054,
+      "reward_std": 0.07740528881549835,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 318
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 744.0,
+      "completions/max_terminated_length": 744.0,
+      "completions/mean_length": 404.02679443359375,
+      "completions/mean_terminated_length": 404.02679443359375,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "epoch": 0.32912045395924683,
+      "grad_norm": 0.7342789769172668,
+      "kl": 0.0582275390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0117,
+      "num_tokens": 36366795.0,
+      "reward": 1.1437500715255737,
+      "reward_std": 0.08607304841279984,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14374999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.17296110093593597,
+      "step": 319
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 790.0,
+      "completions/max_terminated_length": 790.0,
+      "completions/mean_length": 457.26788330078125,
+      "completions/mean_terminated_length": 457.26788330078125,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 0.3301521795202476,
+      "grad_norm": 0.6931236982345581,
+      "kl": 0.04937744140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0035,
+      "num_tokens": 36480680.0,
+      "reward": 1.1281250715255737,
+      "reward_std": 0.11288312077522278,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713,
+      "step": 320
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 842.0,
+      "completions/max_terminated_length": 842.0,
+      "completions/mean_length": 397.5089416503906,
+      "completions/mean_terminated_length": 397.5089416503906,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "epoch": 0.3311839050812484,
+      "grad_norm": 0.5426543354988098,
+      "kl": 0.04840087890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0157,
+      "num_tokens": 36585583.0,
+      "reward": 1.1875,
+      "reward_std": 0.05886319279670715,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1875,
+      "rewards/curriculum_aware_reward_fn/std": 0.18152259290218353,
+      "step": 321
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 867.0,
+      "completions/max_terminated_length": 867.0,
+      "completions/mean_length": 462.1785888671875,
+      "completions/mean_terminated_length": 462.1785888671875,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 0.3322156306422492,
+      "grad_norm": 0.5812787413597107,
+      "kl": 0.05181884765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0072,
+      "num_tokens": 36709916.0,
+      "reward": 1.0660713911056519,
+      "reward_std": 0.08255477249622345,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07499999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.144259512424469,
+      "step": 322
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 731.0,
+      "completions/max_terminated_length": 731.0,
+      "completions/mean_length": 362.1339416503906,
+      "completions/mean_terminated_length": 362.1339416503906,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 0.3332473562032499,
+      "grad_norm": 0.5366288423538208,
+      "kl": 0.05487060546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0091,
+      "num_tokens": 36814805.0,
+      "reward": 1.2156251668930054,
+      "reward_std": 0.07194303721189499,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21562500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.2006780505180359,
+      "step": 323
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1474.0,
+      "completions/max_terminated_length": 1474.0,
+      "completions/mean_length": 453.2500305175781,
+      "completions/mean_terminated_length": 453.2500305175781,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 0.3342790817642507,
+      "grad_norm": 0.4391082227230072,
+      "kl": 0.04986572265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0101,
+      "num_tokens": 36939692.0,
+      "reward": 1.0906250476837158,
+      "reward_std": 0.057444632053375244,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364,
+      "step": 324
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 823.0,
+      "completions/max_terminated_length": 823.0,
+      "completions/mean_length": 446.6160888671875,
+      "completions/mean_terminated_length": 446.6160888671875,
+      "completions/min_length": 259.0,
+      "completions/min_terminated_length": 259.0,
+      "epoch": 0.3353108073252515,
+      "grad_norm": 0.4519873857498169,
+      "kl": 0.04400634765625,
+      "learning_rate": 1e-06,
+      "loss": 0.003,
+      "num_tokens": 37062268.0,
+      "reward": 1.0718750953674316,
+      "reward_std": 0.05161399021744728,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07187499850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859,
+      "step": 325
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 868.0,
+      "completions/max_terminated_length": 868.0,
+      "completions/mean_length": 420.3571472167969,
+      "completions/mean_terminated_length": 420.3571472167969,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 0.33634253288625227,
+      "grad_norm": 0.6282175183296204,
+      "kl": 0.05517578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0215,
+      "num_tokens": 37172824.0,
+      "reward": 1.1343750953674316,
+      "reward_std": 0.08366710692644119,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 326
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 832.0,
+      "completions/max_terminated_length": 832.0,
+      "completions/mean_length": 426.0446472167969,
+      "completions/mean_terminated_length": 426.0446472167969,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 0.33737425844725305,
+      "grad_norm": 0.7190306186676025,
+      "kl": 0.048095703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0037,
+      "num_tokens": 37287132.0,
+      "reward": 1.1093751192092896,
+      "reward_std": 0.09674695134162903,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1093749925494194,
+      "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663,
+      "step": 327
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 781.0,
+      "completions/max_terminated_length": 781.0,
+      "completions/mean_length": 426.4285888671875,
+      "completions/mean_terminated_length": 426.4285888671875,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.3384059840082538,
+      "grad_norm": 0.6656579971313477,
+      "kl": 0.0509033203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0365,
+      "num_tokens": 37400594.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.09572818130254745,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562498658895493,
+      "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892,
+      "step": 328
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1045.0,
+      "completions/max_terminated_length": 1045.0,
+      "completions/mean_length": 410.7946472167969,
+      "completions/mean_terminated_length": 410.7946472167969,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "epoch": 0.33943770956925456,
+      "grad_norm": 0.48774799704551697,
+      "kl": 0.05328369140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0091,
+      "num_tokens": 37515910.0,
+      "reward": 1.09375,
+      "reward_std": 0.037883758544921875,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569154918193817,
+      "step": 329
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 851.0,
+      "completions/max_terminated_length": 851.0,
+      "completions/mean_length": 409.4107360839844,
+      "completions/mean_terminated_length": 409.4107360839844,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 0.34046943513025535,
+      "grad_norm": 0.3467229902744293,
+      "kl": 0.04986572265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0096,
+      "num_tokens": 37633836.0,
+      "reward": 1.1437500715255737,
+      "reward_std": 0.028228627517819405,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.17296108603477478,
+      "step": 330
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 869.0,
+      "completions/max_terminated_length": 869.0,
+      "completions/mean_length": 447.71429443359375,
+      "completions/mean_terminated_length": 447.71429443359375,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 0.34150116069125613,
+      "grad_norm": 0.6817765831947327,
+      "kl": 0.047607421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0106,
+      "num_tokens": 37750193.0,
+      "reward": 1.1000001430511475,
+      "reward_std": 0.1166761964559555,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.15882451832294464,
+      "step": 331
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 758.0,
+      "completions/max_terminated_length": 758.0,
+      "completions/mean_length": 474.5625305175781,
+      "completions/mean_terminated_length": 474.5625305175781,
+      "completions/min_length": 264.0,
+      "completions/min_terminated_length": 264.0,
+      "epoch": 0.3425328862522569,
+      "grad_norm": 0.523029088973999,
+      "kl": 0.0455322265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0182,
+      "num_tokens": 37887736.0,
+      "reward": 1.1281250715255737,
+      "reward_std": 0.07777367532253265,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713,
+      "step": 332
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 891.0,
+      "completions/max_terminated_length": 891.0,
+      "completions/mean_length": 434.5714416503906,
+      "completions/mean_terminated_length": 434.5714416503906,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 0.34356461181325765,
+      "grad_norm": 0.7264805436134338,
+      "kl": 0.047607421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0122,
+      "num_tokens": 38011224.0,
+      "reward": 1.1406251192092896,
+      "reward_std": 0.10600230097770691,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.140625,
+      "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658,
+      "step": 333
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 993.0,
+      "completions/max_terminated_length": 993.0,
+      "completions/mean_length": 473.8839416503906,
+      "completions/mean_terminated_length": 473.8839416503906,
+      "completions/min_length": 257.0,
+      "completions/min_terminated_length": 257.0,
+      "epoch": 0.34459633737425843,
+      "grad_norm": 0.45366787910461426,
+      "kl": 0.04718017578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0151,
+      "num_tokens": 38137323.0,
+      "reward": 1.0250000953674316,
+      "reward_std": 0.04476457089185715,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.02500000037252903,
+      "rewards/curriculum_aware_reward_fn/std": 0.09054389595985413,
+      "step": 334
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1048.0,
+      "completions/max_terminated_length": 1048.0,
+      "completions/mean_length": 481.1785888671875,
+      "completions/mean_terminated_length": 481.1785888671875,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 0.3456280629352592,
+      "grad_norm": 0.304663747549057,
+      "kl": 0.04620361328125,
+      "learning_rate": 1e-06,
+      "loss": 0.006,
+      "num_tokens": 38266486.0,
+      "reward": 1.0250000953674316,
+      "reward_std": 0.024803917855024338,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.02500000037252903,
+      "rewards/curriculum_aware_reward_fn/std": 0.09054390341043472,
+      "step": 335
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1294.0,
+      "completions/max_terminated_length": 1294.0,
+      "completions/mean_length": 418.5535888671875,
+      "completions/mean_terminated_length": 418.5535888671875,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 0.34665978849626,
+      "grad_norm": 0.7588237524032593,
+      "kl": 0.05780029296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0034,
+      "num_tokens": 38373186.0,
+      "reward": 1.1531251668930054,
+      "reward_std": 0.11087695509195328,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15312500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17440778017044067,
+      "step": 336
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 684.0,
+      "completions/max_terminated_length": 684.0,
+      "completions/mean_length": 435.4196472167969,
+      "completions/mean_terminated_length": 435.4196472167969,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 0.3476915140572608,
+      "grad_norm": 0.7718022465705872,
+      "kl": 0.052001953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0044,
+      "num_tokens": 38499065.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.1225382462143898,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562498658895493,
+      "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892,
+      "step": 337
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 805.0,
+      "completions/max_terminated_length": 805.0,
+      "completions/mean_length": 429.45538330078125,
+      "completions/mean_terminated_length": 429.45538330078125,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 0.3487232396182615,
+      "grad_norm": 0.5799484252929688,
+      "kl": 0.04791259765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0081,
+      "num_tokens": 38621035.0,
+      "reward": 1.1031250953674316,
+      "reward_std": 0.0612691193819046,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063,
+      "step": 338
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 788.0,
+      "completions/max_terminated_length": 788.0,
+      "completions/mean_length": 433.4821472167969,
+      "completions/mean_terminated_length": 433.4821472167969,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.3497549651792623,
+      "grad_norm": 0.7384994626045227,
+      "kl": 0.0521240234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0066,
+      "num_tokens": 38731262.0,
+      "reward": 1.0718750953674316,
+      "reward_std": 0.08986613154411316,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07187499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859,
+      "step": 339
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1145.0,
+      "completions/max_terminated_length": 1145.0,
+      "completions/mean_length": 432.7500305175781,
+      "completions/mean_terminated_length": 432.7500305175781,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 0.3507866907402631,
+      "grad_norm": 0.6898339986801147,
+      "kl": 0.04461669921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0014,
+      "num_tokens": 38847798.0,
+      "reward": 1.0875000953674316,
+      "reward_std": 0.0898975357413292,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.15223558247089386,
+      "step": 340
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 792.0,
+      "completions/max_terminated_length": 792.0,
+      "completions/mean_length": 418.8214416503906,
+      "completions/mean_terminated_length": 418.8214416503906,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 0.35181841630126387,
+      "grad_norm": 0.5710652470588684,
+      "kl": 0.05328369140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0003,
+      "num_tokens": 38960093.0,
+      "reward": 1.0843751430511475,
+      "reward_std": 0.08021100610494614,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08437500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536,
+      "step": 341
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 835.0,
+      "completions/max_terminated_length": 835.0,
+      "completions/mean_length": 442.3482360839844,
+      "completions/mean_terminated_length": 442.3482360839844,
+      "completions/min_length": 254.0,
+      "completions/min_terminated_length": 254.0,
+      "epoch": 0.35285014186226465,
+      "grad_norm": 0.6075477004051208,
+      "kl": 0.047119140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0016,
+      "num_tokens": 39076675.0,
+      "reward": 1.068750023841858,
+      "reward_std": 0.08363571763038635,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235,
+      "step": 342
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 945.0,
+      "completions/max_terminated_length": 945.0,
+      "completions/mean_length": 399.6339416503906,
+      "completions/mean_terminated_length": 399.6339416503906,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 0.3538818674232654,
+      "grad_norm": 0.5659580230712891,
+      "kl": 0.057861328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0029,
+      "num_tokens": 39185750.0,
+      "reward": 1.1312501430511475,
+      "reward_std": 0.05988196283578873,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13124999403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.17020456492900848,
+      "step": 343
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 532.0,
+      "completions/max_terminated_length": 532.0,
+      "completions/mean_length": 366.3214416503906,
+      "completions/mean_terminated_length": 366.3214416503906,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 0.35491359298426617,
+      "grad_norm": 0.6755393147468567,
+      "kl": 0.05224609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0031,
+      "num_tokens": 39287580.0,
+      "reward": 1.1875,
+      "reward_std": 0.08122977614402771,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1875,
+      "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084,
+      "step": 344
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 810.0,
+      "completions/max_terminated_length": 810.0,
+      "completions/mean_length": 422.0446472167969,
+      "completions/mean_terminated_length": 422.0446472167969,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 0.35594531854526695,
+      "grad_norm": 0.6563684940338135,
+      "kl": 0.04205322265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0067,
+      "num_tokens": 39414908.0,
+      "reward": 1.084375023841858,
+      "reward_std": 0.08159816265106201,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08437498658895493,
+      "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536,
+      "step": 345
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 916.0,
+      "completions/max_terminated_length": 916.0,
+      "completions/mean_length": 420.4732360839844,
+      "completions/mean_terminated_length": 420.4732360839844,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 0.35697704410626774,
+      "grad_norm": 0.6138877272605896,
+      "kl": 0.05419921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0193,
+      "num_tokens": 39524665.0,
+      "reward": 1.1379464864730835,
+      "reward_std": 0.09797175228595734,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847,
+      "step": 346
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 778.0,
+      "completions/max_terminated_length": 778.0,
+      "completions/mean_length": 401.0357360839844,
+      "completions/mean_terminated_length": 401.0357360839844,
+      "completions/min_length": 161.0,
+      "completions/min_terminated_length": 161.0,
+      "epoch": 0.3580087696672685,
+      "grad_norm": 0.7666031718254089,
+      "kl": 0.065185546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0026,
+      "num_tokens": 39637208.0,
+      "reward": 1.1468751430511475,
+      "reward_std": 0.086041659116745,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847,
+      "step": 347
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1541.0,
+      "completions/max_terminated_length": 1541.0,
+      "completions/mean_length": 466.33929443359375,
+      "completions/mean_terminated_length": 466.33929443359375,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 0.3590404952282693,
+      "grad_norm": 0.5586252808570862,
+      "kl": 0.0443115234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0025,
+      "num_tokens": 39763442.0,
+      "reward": 1.140625238418579,
+      "reward_std": 0.07536774128675461,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1406249850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658,
+      "step": 348
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 769.0,
+      "completions/max_terminated_length": 769.0,
+      "completions/mean_length": 407.15179443359375,
+      "completions/mean_terminated_length": 407.15179443359375,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 0.36007222078927004,
+      "grad_norm": 0.719732940196991,
+      "kl": 0.0467529296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0105,
+      "num_tokens": 39870685.0,
+      "reward": 1.1656252145767212,
+      "reward_std": 0.10359636694192886,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16562499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17553408443927765,
+      "step": 349
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 845.0,
+      "completions/max_terminated_length": 845.0,
+      "completions/mean_length": 421.9464416503906,
+      "completions/mean_terminated_length": 421.9464416503906,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 0.3611039463502708,
+      "grad_norm": 0.598708987236023,
+      "kl": 0.0479736328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0075,
+      "num_tokens": 39988958.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.08949775248765945,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1653594672679901,
+      "step": 350
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1080.0,
+      "completions/max_terminated_length": 1080.0,
+      "completions/mean_length": 420.8214416503906,
+      "completions/mean_terminated_length": 420.8214416503906,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 0.3621356719112716,
+      "grad_norm": 0.6411553621292114,
+      "kl": 0.04693603515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0069,
+      "num_tokens": 40104039.0,
+      "reward": 1.1375000476837158,
+      "reward_std": 0.08505427092313766,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17170317471027374,
+      "step": 351
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 929.0,
+      "completions/max_terminated_length": 929.0,
+      "completions/mean_length": 398.6339416503906,
+      "completions/mean_terminated_length": 398.6339416503906,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 0.3631673974722724,
+      "grad_norm": 0.6558811068534851,
+      "kl": 0.0516357421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0033,
+      "num_tokens": 40223796.0,
+      "reward": 1.125,
+      "reward_std": 0.08261694014072418,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446,
+      "step": 352
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 765.0,
+      "completions/max_terminated_length": 765.0,
+      "completions/mean_length": 433.3035888671875,
+      "completions/mean_terminated_length": 433.3035888671875,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.3641991230332732,
+      "grad_norm": 0.6025667190551758,
+      "kl": 0.0465087890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0034,
+      "num_tokens": 40333849.0,
+      "reward": 1.0875000953674316,
+      "reward_std": 0.05645725876092911,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267,
+      "step": 353
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 844.0,
+      "completions/max_terminated_length": 844.0,
+      "completions/mean_length": 419.1964416503906,
+      "completions/mean_terminated_length": 419.1964416503906,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "epoch": 0.3652308485942739,
+      "grad_norm": 0.6397855281829834,
+      "kl": 0.0496826171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0166,
+      "num_tokens": 40460648.0,
+      "reward": 1.0968750715255737,
+      "reward_std": 0.08363571763038635,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09687499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.15729717910289764,
+      "step": 354
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 856.0,
+      "completions/max_terminated_length": 856.0,
+      "completions/mean_length": 442.5714416503906,
+      "completions/mean_terminated_length": 442.5714416503906,
+      "completions/min_length": 151.0,
+      "completions/min_terminated_length": 151.0,
+      "epoch": 0.3662625741552747,
+      "grad_norm": 0.6532189249992371,
+      "kl": 0.0506591796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0114,
+      "num_tokens": 40579999.0,
+      "reward": 1.125000238418579,
+      "reward_std": 0.08607304841279984,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1249999925494194,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845881938934326,
+      "step": 355
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 814.0,
+      "completions/max_terminated_length": 814.0,
+      "completions/mean_length": 414.6875305175781,
+      "completions/mean_terminated_length": 414.6875305175781,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 0.3672942997162755,
+      "grad_norm": 0.4674402177333832,
+      "kl": 0.0479736328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0021,
+      "num_tokens": 40697165.0,
+      "reward": 1.1000001430511475,
+      "reward_std": 0.05198238044977188,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09999998658895493,
+      "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345,
+      "step": 356
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 734.0,
+      "completions/max_terminated_length": 734.0,
+      "completions/mean_length": 442.8214416503906,
+      "completions/mean_terminated_length": 442.8214416503906,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 0.36832602527727626,
+      "grad_norm": 0.6097815036773682,
+      "kl": 0.04449462890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0006,
+      "num_tokens": 40815214.0,
+      "reward": 1.0656250715255737,
+      "reward_std": 0.07539913058280945,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06562499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.13722331821918488,
+      "step": 357
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 730.0,
+      "completions/max_terminated_length": 730.0,
+      "completions/mean_length": 422.1964416503906,
+      "completions/mean_terminated_length": 422.1964416503906,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "epoch": 0.36935775083827704,
+      "grad_norm": 0.7170711755752563,
+      "kl": 0.04144287109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0042,
+      "num_tokens": 40934663.0,
+      "reward": 1.1348215341567993,
+      "reward_std": 0.11864346265792847,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.17296110093593597,
+      "step": 358
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 850.0,
+      "completions/max_terminated_length": 850.0,
+      "completions/mean_length": 410.08038330078125,
+      "completions/mean_terminated_length": 410.08038330078125,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 0.3703894763992778,
+      "grad_norm": 0.6699801683425903,
+      "kl": 0.0465087890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0027,
+      "num_tokens": 41050795.0,
+      "reward": 1.1531251668930054,
+      "reward_std": 0.08021100610494614,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15312500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17440778017044067,
+      "step": 359
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 887.0,
+      "completions/max_terminated_length": 887.0,
+      "completions/mean_length": 436.4821472167969,
+      "completions/mean_terminated_length": 436.4821472167969,
+      "completions/min_length": 145.0,
+      "completions/min_terminated_length": 145.0,
+      "epoch": 0.37142120196027856,
+      "grad_norm": 0.6288172006607056,
+      "kl": 0.05291748046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0039,
+      "num_tokens": 41168393.0,
+      "reward": 1.087499976158142,
+      "reward_std": 0.08224854618310928,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08749999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267,
+      "step": 360
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1025.0,
+      "completions/max_terminated_length": 1025.0,
+      "completions/mean_length": 444.3214416503906,
+      "completions/mean_terminated_length": 444.3214416503906,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 0.37245292752127934,
+      "grad_norm": 0.5419376492500305,
+      "kl": 0.04608154296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0014,
+      "num_tokens": 41291272.0,
+      "reward": 1.1281250715255737,
+      "reward_std": 0.05886319279670715,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713,
+      "step": 361
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 786.0,
+      "completions/max_terminated_length": 786.0,
+      "completions/mean_length": 395.21429443359375,
+      "completions/mean_terminated_length": 395.21429443359375,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 0.3734846530822801,
+      "grad_norm": 0.6662964224815369,
+      "kl": 0.0462646484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0034,
+      "num_tokens": 41401840.0,
+      "reward": 1.125,
+      "reward_std": 0.09252267330884933,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446,
+      "step": 362
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 942.0,
+      "completions/max_terminated_length": 942.0,
+      "completions/mean_length": 442.982177734375,
+      "completions/mean_terminated_length": 442.982177734375,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "epoch": 0.3745163786432809,
+      "grad_norm": 0.7266864776611328,
+      "kl": 0.0445556640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0045,
+      "num_tokens": 41528853.0,
+      "reward": 1.0875000953674316,
+      "reward_std": 0.08505427092313766,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267,
+      "step": 363
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1096.0,
+      "completions/max_terminated_length": 1096.0,
+      "completions/mean_length": 421.4821472167969,
+      "completions/mean_terminated_length": 421.4821472167969,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 0.37554810420428164,
+      "grad_norm": 0.6591650247573853,
+      "kl": 0.05230712890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0057,
+      "num_tokens": 41637421.0,
+      "reward": 1.115625023841858,
+      "reward_std": 0.0860416442155838,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892,
+      "step": 364
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1655.0,
+      "completions/max_terminated_length": 1655.0,
+      "completions/mean_length": 468.1607360839844,
+      "completions/mean_terminated_length": 468.1607360839844,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 0.3765798297652824,
+      "grad_norm": 0.6517752408981323,
+      "kl": 0.0452880859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0157,
+      "num_tokens": 41759204.0,
+      "reward": 1.09375,
+      "reward_std": 0.08363571017980576,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0937499925494194,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569157898426056,
+      "step": 365
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 665.0,
+      "completions/max_terminated_length": 665.0,
+      "completions/mean_length": 388.4910888671875,
+      "completions/mean_terminated_length": 388.4910888671875,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 0.3776115553262832,
+      "grad_norm": 0.6884093880653381,
+      "kl": 0.04962158203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0036,
+      "num_tokens": 41866282.0,
+      "reward": 1.1406251192092896,
+      "reward_std": 0.09918428212404251,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.140625,
+      "rewards/curriculum_aware_reward_fn/std": 0.20724830031394958,
+      "step": 366
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 806.0,
+      "completions/max_terminated_length": 806.0,
+      "completions/mean_length": 403.2857360839844,
+      "completions/mean_terminated_length": 403.2857360839844,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "epoch": 0.378643280887284,
+      "grad_norm": 0.60127854347229,
+      "kl": 0.04571533203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0085,
+      "num_tokens": 41973687.0,
+      "reward": 1.1250001192092896,
+      "reward_std": 0.0798112154006958,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446,
+      "step": 367
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 910.0,
+      "completions/max_terminated_length": 910.0,
+      "completions/mean_length": 410.4910888671875,
+      "completions/mean_terminated_length": 410.4910888671875,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 0.3796750064482848,
+      "grad_norm": 0.6465752720832825,
+      "kl": 0.0518798828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0289,
+      "num_tokens": 42083173.0,
+      "reward": 1.1343750953674316,
+      "reward_std": 0.08949775993824005,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.1893601268529892,
+      "step": 368
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1342.0,
+      "completions/max_terminated_length": 1342.0,
+      "completions/mean_length": 394.1696472167969,
+      "completions/mean_terminated_length": 394.1696472167969,
+      "completions/min_length": 119.0,
+      "completions/min_terminated_length": 119.0,
+      "epoch": 0.3807067320092855,
+      "grad_norm": 0.7463873624801636,
+      "kl": 0.05291748046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0023,
+      "num_tokens": 42187520.0,
+      "reward": 1.1437500715255737,
+      "reward_std": 0.10257759690284729,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.17296108603477478,
+      "step": 369
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 808.0,
+      "completions/max_terminated_length": 808.0,
+      "completions/mean_length": 434.0000305175781,
+      "completions/mean_terminated_length": 434.0000305175781,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 0.3817384575702863,
+      "grad_norm": 0.7016083598136902,
+      "kl": 0.059326171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0265,
+      "num_tokens": 42313065.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.10297737270593643,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562498658895493,
+      "rewards/curriculum_aware_reward_fn/std": 0.17820820212364197,
+      "step": 370
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 830.0,
+      "completions/max_terminated_length": 830.0,
+      "completions/mean_length": 412.64288330078125,
+      "completions/mean_terminated_length": 412.64288330078125,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 0.3827701831312871,
+      "grad_norm": 0.6370155811309814,
+      "kl": 0.0504150390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0026,
+      "num_tokens": 42424458.0,
+      "reward": 1.1281250715255737,
+      "reward_std": 0.08326732367277145,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713,
+      "step": 371
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1306.0,
+      "completions/max_terminated_length": 1306.0,
+      "completions/mean_length": 379.7946472167969,
+      "completions/mean_terminated_length": 379.7946472167969,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 0.38380190869228786,
+      "grad_norm": 0.7614575624465942,
+      "kl": 0.0594482421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0085,
+      "num_tokens": 42538150.0,
+      "reward": 1.1812502145767212,
+      "reward_std": 0.09532840549945831,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.18124999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.21520207822322845,
+      "step": 372
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1313.0,
+      "completions/max_terminated_length": 1313.0,
+      "completions/mean_length": 464.3839416503906,
+      "completions/mean_terminated_length": 464.3839416503906,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 0.38483363425328865,
+      "grad_norm": 0.4783412516117096,
+      "kl": 0.04736328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0024,
+      "num_tokens": 42657277.0,
+      "reward": 1.1125000715255737,
+      "reward_std": 0.0660809874534607,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11250000447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437,
+      "step": 373
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 679.0,
+      "completions/max_terminated_length": 679.0,
+      "completions/mean_length": 404.8035888671875,
+      "completions/mean_terminated_length": 404.8035888671875,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 0.3858653598142894,
+      "grad_norm": 0.5575130581855774,
+      "kl": 0.05108642578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0106,
+      "num_tokens": 42764404.0,
+      "reward": 1.1468751430511475,
+      "reward_std": 0.0671311616897583,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847,
+      "step": 374
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 790.0,
+      "completions/max_terminated_length": 790.0,
+      "completions/mean_length": 421.3660888671875,
+      "completions/mean_terminated_length": 421.3660888671875,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.38689708537529016,
+      "grad_norm": 0.7298808097839355,
+      "kl": 0.050048828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0233,
+      "num_tokens": 42874625.0,
+      "reward": 1.1316965818405151,
+      "reward_std": 0.10944001376628876,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1406249850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658,
+      "step": 375
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 780.0,
+      "completions/max_terminated_length": 780.0,
+      "completions/mean_length": 387.9196472167969,
+      "completions/mean_terminated_length": 387.9196472167969,
+      "completions/min_length": 128.0,
+      "completions/min_terminated_length": 128.0,
+      "epoch": 0.38792881093629095,
+      "grad_norm": 0.5968577265739441,
+      "kl": 0.05230712890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0209,
+      "num_tokens": 42991649.0,
+      "reward": 1.1531251668930054,
+      "reward_std": 0.07055586576461792,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15312500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17440778017044067,
+      "step": 376
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 716.0,
+      "completions/max_terminated_length": 716.0,
+      "completions/mean_length": 423.77679443359375,
+      "completions/mean_terminated_length": 423.77679443359375,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 0.38896053649729173,
+      "grad_norm": 0.5788670778274536,
+      "kl": 0.04803466796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0011,
+      "num_tokens": 43110316.0,
+      "reward": 1.0843751430511475,
+      "reward_std": 0.07333019375801086,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08437500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536,
+      "step": 377
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 598.0,
+      "completions/max_terminated_length": 598.0,
+      "completions/mean_length": 373.6875305175781,
+      "completions/mean_terminated_length": 373.6875305175781,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "epoch": 0.3899922620582925,
+      "grad_norm": 0.7469679713249207,
+      "kl": 0.0487060546875,
+      "learning_rate": 1e-06,
+      "loss": 0.032,
+      "num_tokens": 43217570.0,
+      "reward": 1.171875,
+      "reward_std": 0.1025775894522667,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.171875,
+      "rewards/curriculum_aware_reward_fn/std": 0.175758495926857,
+      "step": 378
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 821.0,
+      "completions/max_terminated_length": 821.0,
+      "completions/mean_length": 415.3482360839844,
+      "completions/mean_terminated_length": 415.3482360839844,
+      "completions/min_length": 152.0,
+      "completions/min_terminated_length": 152.0,
+      "epoch": 0.39102398761929325,
+      "grad_norm": 0.4806562662124634,
+      "kl": 0.0462646484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0037,
+      "num_tokens": 43334964.0,
+      "reward": 1.0968750715255737,
+      "reward_std": 0.053001150488853455,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09687500447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.15729716420173645,
+      "step": 379
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 678.0,
+      "completions/max_terminated_length": 678.0,
+      "completions/mean_length": 386.6607360839844,
+      "completions/mean_terminated_length": 386.6607360839844,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 0.39205571318029403,
+      "grad_norm": 0.6967973113059998,
+      "kl": 0.04705810546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0034,
+      "num_tokens": 43446520.0,
+      "reward": 1.1191965341567993,
+      "reward_std": 0.09760335832834244,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.1878974735736847,
+      "step": 380
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 643.0,
+      "completions/max_terminated_length": 643.0,
+      "completions/mean_length": 383.1964416503906,
+      "completions/mean_terminated_length": 383.1964416503906,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "epoch": 0.3930874387412948,
+      "grad_norm": 0.5589176416397095,
+      "kl": 0.05255126953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0048,
+      "num_tokens": 43557282.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.0612691231071949,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892,
+      "step": 381
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 847.0,
+      "completions/max_terminated_length": 847.0,
+      "completions/mean_length": 348.3482360839844,
+      "completions/mean_terminated_length": 348.3482360839844,
+      "completions/min_length": 132.0,
+      "completions/min_terminated_length": 132.0,
+      "epoch": 0.3941191643022956,
+      "grad_norm": 0.6366583108901978,
+      "kl": 0.049072265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0249,
+      "num_tokens": 43657960.0,
+      "reward": 1.1437500715255737,
+      "reward_std": 0.08122977614402771,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.17296108603477478,
+      "step": 382
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 788.0,
+      "completions/max_terminated_length": 788.0,
+      "completions/mean_length": 442.58038330078125,
+      "completions/mean_terminated_length": 442.58038330078125,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 0.3951508898632964,
+      "grad_norm": 0.5829880833625793,
+      "kl": 0.04364013671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0056,
+      "num_tokens": 43777165.0,
+      "reward": 1.0437501668930054,
+      "reward_std": 0.07678630203008652,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.04374999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.11627185344696045,
+      "step": 383
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 836.0,
+      "completions/max_terminated_length": 836.0,
+      "completions/mean_length": 418.8750305175781,
+      "completions/mean_terminated_length": 418.8750305175781,
+      "completions/min_length": 131.0,
+      "completions/min_terminated_length": 131.0,
+      "epoch": 0.3961826154242971,
+      "grad_norm": 0.5630151033401489,
+      "kl": 0.05035400390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0009,
+      "num_tokens": 43896165.0,
+      "reward": 1.1125000715255737,
+      "reward_std": 0.07536774128675461,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11250000447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437,
+      "step": 384
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1048.0,
+      "completions/max_terminated_length": 1048.0,
+      "completions/mean_length": 396.4375305175781,
+      "completions/mean_terminated_length": 396.4375305175781,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 0.3972143409852979,
+      "grad_norm": 0.6741788983345032,
+      "kl": 0.0452880859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0044,
+      "num_tokens": 44004406.0,
+      "reward": 1.21875,
+      "reward_std": 0.07194302976131439,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21875,
+      "rewards/curriculum_aware_reward_fn/std": 0.21076062321662903,
+      "step": 385
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1025.0,
+      "completions/max_terminated_length": 1025.0,
+      "completions/mean_length": 432.4285888671875,
+      "completions/mean_terminated_length": 432.4285888671875,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "epoch": 0.3982460665462987,
+      "grad_norm": 0.6056326031684875,
+      "kl": 0.04156494140625,
+      "learning_rate": 1e-06,
+      "loss": 0.014,
+      "num_tokens": 44116167.0,
+      "reward": 1.109375,
+      "reward_std": 0.08986614644527435,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1093749925494194,
+      "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663,
+      "step": 386
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 710.0,
+      "completions/max_terminated_length": 710.0,
+      "completions/mean_length": 388.2321472167969,
+      "completions/mean_terminated_length": 388.2321472167969,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 0.39927779210729947,
+      "grad_norm": 0.7883094549179077,
+      "kl": 0.04345703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0217,
+      "num_tokens": 44229866.0,
+      "reward": 1.1218751668930054,
+      "reward_std": 0.09332224726676941,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 387
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 893.0,
+      "completions/max_terminated_length": 893.0,
+      "completions/mean_length": 469.39288330078125,
+      "completions/mean_terminated_length": 469.39288330078125,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 0.40030951766830025,
+      "grad_norm": 0.6451113224029541,
+      "kl": 0.04473876953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0096,
+      "num_tokens": 44351031.0,
+      "reward": 1.109375,
+      "reward_std": 0.10396476089954376,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.109375,
+      "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663,
+      "step": 388
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 817.0,
+      "completions/max_terminated_length": 817.0,
+      "completions/mean_length": 384.70538330078125,
+      "completions/mean_terminated_length": 384.70538330078125,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 0.401341243229301,
+      "grad_norm": 0.5290384888648987,
+      "kl": 0.0445556640625,
+      "learning_rate": 1e-06,
+      "loss": -0.005,
+      "num_tokens": 44461283.0,
+      "reward": 1.0968750715255737,
+      "reward_std": 0.05682564526796341,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09687499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.15729717910289764,
+      "step": 389
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 790.0,
+      "completions/max_terminated_length": 790.0,
+      "completions/mean_length": 406.6607360839844,
+      "completions/mean_terminated_length": 406.6607360839844,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 0.40237296879030177,
+      "grad_norm": 0.7036343216896057,
+      "kl": 0.04498291015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0037,
+      "num_tokens": 44579237.0,
+      "reward": 1.1312501430511475,
+      "reward_std": 0.10017165541648865,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13125000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.17020456492900848,
+      "step": 390
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 992.0,
+      "completions/max_terminated_length": 992.0,
+      "completions/mean_length": 452.5982360839844,
+      "completions/mean_terminated_length": 452.5982360839844,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 0.40340469435130255,
+      "grad_norm": 0.6781396865844727,
+      "kl": 0.045166015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0024,
+      "num_tokens": 44694398.0,
+      "reward": 1.125,
+      "reward_std": 0.07398058474063873,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845881938934326,
+      "step": 391
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 902.0,
+      "completions/max_terminated_length": 902.0,
+      "completions/mean_length": 434.8750305175781,
+      "completions/mean_terminated_length": 434.8750305175781,
+      "completions/min_length": 127.0,
+      "completions/min_terminated_length": 127.0,
+      "epoch": 0.40443641991230334,
+      "grad_norm": 0.5618754029273987,
+      "kl": 0.04437255859375,
+      "learning_rate": 1e-06,
+      "loss": 0.012,
+      "num_tokens": 44814735.0,
+      "reward": 1.084375023841858,
+      "reward_std": 0.07536774128675461,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08437500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536,
+      "step": 392
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 910.0,
+      "completions/mean_length": 435.3214416503906,
+      "completions/mean_terminated_length": 402.34234619140625,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 0.4054681454733041,
+      "grad_norm": 0.5044761896133423,
+      "kl": 0.0457763671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0414,
+      "num_tokens": 44926540.0,
+      "reward": 1.1254465579986572,
+      "reward_std": 0.07346688210964203,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 393
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 639.0,
+      "completions/max_terminated_length": 639.0,
+      "completions/mean_length": 364.8125305175781,
+      "completions/mean_terminated_length": 364.8125305175781,
+      "completions/min_length": 143.0,
+      "completions/min_terminated_length": 143.0,
+      "epoch": 0.40649987103430485,
+      "grad_norm": 0.6403542160987854,
+      "kl": 0.04803466796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0018,
+      "num_tokens": 45029475.0,
+      "reward": 1.1000001430511475,
+      "reward_std": 0.07194302976131439,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345,
+      "step": 394
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 659.0,
+      "completions/max_terminated_length": 659.0,
+      "completions/mean_length": 370.83929443359375,
+      "completions/mean_terminated_length": 370.83929443359375,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 0.40753159659530563,
+      "grad_norm": 0.6687615513801575,
+      "kl": 0.054443359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0006,
+      "num_tokens": 45142542.0,
+      "reward": 1.1125000715255737,
+      "reward_std": 0.08122977614402771,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11249999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437,
+      "step": 395
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1145.0,
+      "completions/max_terminated_length": 1145.0,
+      "completions/mean_length": 417.45538330078125,
+      "completions/mean_terminated_length": 417.45538330078125,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 0.4085633221563064,
+      "grad_norm": 0.5757293701171875,
+      "kl": 0.049072265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0131,
+      "num_tokens": 45262853.0,
+      "reward": 1.1593750715255737,
+      "reward_std": 0.07333019375801086,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15937498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.1750844269990921,
+      "step": 396
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 886.0,
+      "completions/max_terminated_length": 886.0,
+      "completions/mean_length": 379.33038330078125,
+      "completions/mean_terminated_length": 379.33038330078125,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 0.4095950477173072,
+      "grad_norm": 0.6369082927703857,
+      "kl": 0.04974365234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0135,
+      "num_tokens": 45373413.0,
+      "reward": 1.1593750715255737,
+      "reward_std": 0.06851832568645477,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15937498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.1750844269990921,
+      "step": 397
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 739.0,
+      "completions/max_terminated_length": 739.0,
+      "completions/mean_length": 394.9375305175781,
+      "completions/mean_terminated_length": 394.9375305175781,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 0.410626773278308,
+      "grad_norm": 0.5108514428138733,
+      "kl": 0.0435791015625,
+      "learning_rate": 1e-06,
+      "loss": 0.008,
+      "num_tokens": 45486106.0,
+      "reward": 1.068750023841858,
+      "reward_std": 0.058463405817747116,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235,
+      "step": 398
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 933.0,
+      "completions/max_terminated_length": 933.0,
+      "completions/mean_length": 412.3660888671875,
+      "completions/mean_terminated_length": 412.3660888671875,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "epoch": 0.4116584988393087,
+      "grad_norm": 0.6662240624427795,
+      "kl": 0.05010986328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0124,
+      "num_tokens": 45593408.0,
+      "reward": 1.1218751668930054,
+      "reward_std": 0.08949775993824005,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 399
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1019.0,
+      "completions/max_terminated_length": 1019.0,
+      "completions/mean_length": 415.9821472167969,
+      "completions/mean_terminated_length": 415.9821472167969,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "epoch": 0.4126902244003095,
+      "grad_norm": 0.5813707709312439,
+      "kl": 0.051513671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0084,
+      "num_tokens": 45706981.0,
+      "reward": 1.1406251192092896,
+      "reward_std": 0.07194302976131439,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.140625,
+      "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658,
+      "step": 400
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1422.0,
+      "completions/max_terminated_length": 1422.0,
+      "completions/mean_length": 455.357177734375,
+      "completions/mean_terminated_length": 455.357177734375,
+      "completions/min_length": 231.0,
+      "completions/min_terminated_length": 231.0,
+      "epoch": 0.4137219499613103,
+      "grad_norm": 0.5764865875244141,
+      "kl": 0.04718017578125,
+      "learning_rate": 1e-06,
+      "loss": -0.004,
+      "num_tokens": 45838670.0,
+      "reward": 1.1125001907348633,
+      "reward_std": 0.06611239165067673,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11249998956918716,
+      "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437,
+      "step": 401
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 556.0,
+      "completions/max_terminated_length": 556.0,
+      "completions/mean_length": 360.45538330078125,
+      "completions/mean_terminated_length": 360.45538330078125,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 0.41475367552231107,
+      "grad_norm": 0.6881871819496155,
+      "kl": 0.052978515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0092,
+      "num_tokens": 45948992.0,
+      "reward": 1.0968750715255737,
+      "reward_std": 0.07780507206916809,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09687500447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.15729716420173645,
+      "step": 402
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 964.0,
+      "completions/max_terminated_length": 964.0,
+      "completions/mean_length": 426.0714416503906,
+      "completions/mean_terminated_length": 426.0714416503906,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 0.41578540108331186,
+      "grad_norm": 0.6441931128501892,
+      "kl": 0.04095458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.024,
+      "num_tokens": 46063625.0,
+      "reward": 1.1656250953674316,
+      "reward_std": 0.10079064220190048,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16562499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17553408443927765,
+      "step": 403
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 894.0,
+      "completions/max_terminated_length": 894.0,
+      "completions/mean_length": 416.77679443359375,
+      "completions/mean_terminated_length": 416.77679443359375,
+      "completions/min_length": 245.0,
+      "completions/min_terminated_length": 245.0,
+      "epoch": 0.4168171266443126,
+      "grad_norm": 0.5703777074813843,
+      "kl": 0.04595947265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0026,
+      "num_tokens": 46175761.0,
+      "reward": 1.09375,
+      "reward_std": 0.06851832568645477,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 404
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 720.0,
+      "completions/max_terminated_length": 720.0,
+      "completions/mean_length": 399.0000305175781,
+      "completions/mean_terminated_length": 399.0000305175781,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 0.41784885220531337,
+      "grad_norm": 0.6118718385696411,
+      "kl": 0.05035400390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0049,
+      "num_tokens": 46293018.0,
+      "reward": 1.100000023841858,
+      "reward_std": 0.08706042170524597,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.15882451832294464,
+      "step": 405
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 913.0,
+      "completions/max_terminated_length": 913.0,
+      "completions/mean_length": 433.4285888671875,
+      "completions/mean_terminated_length": 433.4285888671875,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 0.41888057776631415,
+      "grad_norm": 0.5196507573127747,
+      "kl": 0.05023193359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0041,
+      "num_tokens": 46419367.0,
+      "reward": 1.1031250953674316,
+      "reward_std": 0.07842406630516052,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063,
+      "step": 406
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 822.0,
+      "completions/max_terminated_length": 822.0,
+      "completions/mean_length": 383.96429443359375,
+      "completions/mean_terminated_length": 383.96429443359375,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "epoch": 0.41991230332731494,
+      "grad_norm": 0.5900987386703491,
+      "kl": 0.0543212890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0019,
+      "num_tokens": 46535521.0,
+      "reward": 1.118749976158142,
+      "reward_std": 0.0612691231071949,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847,
+      "step": 407
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 931.0,
+      "completions/max_terminated_length": 931.0,
+      "completions/mean_length": 380.39288330078125,
+      "completions/mean_terminated_length": 380.39288330078125,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 0.4209440288883157,
+      "grad_norm": 0.8150759935379028,
+      "kl": 0.049560546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0247,
+      "num_tokens": 46639949.0,
+      "reward": 1.1750000715255737,
+      "reward_std": 0.11532045155763626,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.17499998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.17578651010990143,
+      "step": 408
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 678.0,
+      "completions/max_terminated_length": 678.0,
+      "completions/mean_length": 399.0357360839844,
+      "completions/mean_terminated_length": 399.0357360839844,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "epoch": 0.4219757544493165,
+      "grad_norm": 0.5155802369117737,
+      "kl": 0.05224609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0023,
+      "num_tokens": 46756118.0,
+      "reward": 1.1375001668930054,
+      "reward_std": 0.049576446413993835,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17170318961143494,
+      "step": 409
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1088.0,
+      "completions/max_terminated_length": 1088.0,
+      "completions/mean_length": 467.6607360839844,
+      "completions/mean_terminated_length": 467.6607360839844,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 0.42300748001031724,
+      "grad_norm": 0.33023545145988464,
+      "kl": 0.04486083984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0056,
+      "num_tokens": 46890669.0,
+      "reward": 1.0406250953674316,
+      "reward_std": 0.028228627517819405,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.04062500223517418,
+      "rewards/curriculum_aware_reward_fn/std": 0.11261254549026489,
+      "step": 410
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 945.0,
+      "completions/max_terminated_length": 945.0,
+      "completions/mean_length": 419.0625305175781,
+      "completions/mean_terminated_length": 419.0625305175781,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 0.424039205571318,
+      "grad_norm": 0.6532723903656006,
+      "kl": 0.04840087890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0088,
+      "num_tokens": 47002921.0,
+      "reward": 1.1093751192092896,
+      "reward_std": 0.06509362161159515,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.109375,
+      "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663,
+      "step": 411
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 932.0,
+      "completions/max_terminated_length": 932.0,
+      "completions/mean_length": 448.02679443359375,
+      "completions/mean_terminated_length": 448.02679443359375,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 0.4250709311323188,
+      "grad_norm": 0.7180759310722351,
+      "kl": 0.04180908203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0183,
+      "num_tokens": 47114567.0,
+      "reward": 1.1218750476837158,
+      "reward_std": 0.08949775993824005,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 412
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 757.0,
+      "completions/max_terminated_length": 757.0,
+      "completions/mean_length": 410.9464416503906,
+      "completions/mean_terminated_length": 410.9464416503906,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 0.4261026566933196,
+      "grad_norm": 0.7026389837265015,
+      "kl": 0.05511474609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0052,
+      "num_tokens": 47221150.0,
+      "reward": 1.0625,
+      "reward_std": 0.08746020495891571,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0625,
+      "rewards/curriculum_aware_reward_fn/std": 0.13465002179145813,
+      "step": 413
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 640.0,
+      "completions/max_terminated_length": 640.0,
+      "completions/mean_length": 392.1160888671875,
+      "completions/mean_terminated_length": 392.1160888671875,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 0.4271343822543204,
+      "grad_norm": 0.6177908778190613,
+      "kl": 0.05438232421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0052,
+      "num_tokens": 47332386.0,
+      "reward": 1.0687501430511475,
+      "reward_std": 0.06851831823587418,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235,
+      "step": 414
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 916.0,
+      "completions/max_terminated_length": 916.0,
+      "completions/mean_length": 415.9285888671875,
+      "completions/mean_terminated_length": 415.9285888671875,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 0.4281661078153211,
+      "grad_norm": 0.6898444890975952,
+      "kl": 0.05059814453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0142,
+      "num_tokens": 47435460.0,
+      "reward": 1.1375001668930054,
+      "reward_std": 0.08261694759130478,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17170317471027374,
+      "step": 415
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1771.0,
+      "completions/max_terminated_length": 1771.0,
+      "completions/mean_length": 506.02679443359375,
+      "completions/mean_terminated_length": 506.02679443359375,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 0.4291978333763219,
+      "grad_norm": 0.42713573575019836,
+      "kl": 0.03955078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0074,
+      "num_tokens": 47563752.0,
+      "reward": 1.0973215103149414,
+      "reward_std": 0.07421800494194031,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274,
+      "step": 416
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1040.0,
+      "completions/max_terminated_length": 1040.0,
+      "completions/mean_length": 351.3482360839844,
+      "completions/mean_terminated_length": 351.3482360839844,
+      "completions/min_length": 145.0,
+      "completions/min_terminated_length": 145.0,
+      "epoch": 0.4302295589373227,
+      "grad_norm": 0.7151590585708618,
+      "kl": 0.0460205078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0115,
+      "num_tokens": 47663464.0,
+      "reward": 1.2000000476837158,
+      "reward_std": 0.07882384210824966,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.20000000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064,
+      "step": 417
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 886.0,
+      "completions/max_terminated_length": 886.0,
+      "completions/mean_length": 401.89288330078125,
+      "completions/mean_terminated_length": 401.89288330078125,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 0.43126128449832346,
+      "grad_norm": 0.5175272822380066,
+      "kl": 0.04498291015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0041,
+      "num_tokens": 47777069.0,
+      "reward": 1.1281250715255737,
+      "reward_std": 0.0550387017428875,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16936305165290833,
+      "step": 418
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 964.0,
+      "completions/max_terminated_length": 964.0,
+      "completions/mean_length": 351.7589416503906,
+      "completions/mean_terminated_length": 351.7589416503906,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 0.43229301005932425,
+      "grad_norm": 0.703593373298645,
+      "kl": 0.05059814453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0221,
+      "num_tokens": 47876050.0,
+      "reward": 1.0687501430511475,
+      "reward_std": 0.10359635949134827,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235,
+      "step": 419
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 822.0,
+      "completions/max_terminated_length": 822.0,
+      "completions/mean_length": 360.8660888671875,
+      "completions/mean_terminated_length": 360.8660888671875,
+      "completions/min_length": 136.0,
+      "completions/min_terminated_length": 136.0,
+      "epoch": 0.433324735620325,
+      "grad_norm": 0.35815978050231934,
+      "kl": 0.05181884765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0013,
+      "num_tokens": 47988007.0,
+      "reward": 1.109375,
+      "reward_std": 0.03063456155359745,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1093749925494194,
+      "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663,
+      "step": 420
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 930.0,
+      "completions/max_terminated_length": 930.0,
+      "completions/mean_length": 363.7321472167969,
+      "completions/mean_terminated_length": 363.7321472167969,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "epoch": 0.43435646118132576,
+      "grad_norm": 0.557057797908783,
+      "kl": 0.05450439453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0072,
+      "num_tokens": 48093068.0,
+      "reward": 1.140625,
+      "reward_std": 0.05886319279670715,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1406249850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658,
+      "step": 421
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 794.0,
+      "completions/max_terminated_length": 794.0,
+      "completions/mean_length": 416.45538330078125,
+      "completions/mean_terminated_length": 416.45538330078125,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 0.43538818674232654,
+      "grad_norm": 0.692194938659668,
+      "kl": 0.053466796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0037,
+      "num_tokens": 48214303.0,
+      "reward": 1.0781251192092896,
+      "reward_std": 0.08224855363368988,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.078125,
+      "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437,
+      "step": 422
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1327.0,
+      "completions/max_terminated_length": 1327.0,
+      "completions/mean_length": 408.3750305175781,
+      "completions/mean_terminated_length": 408.3750305175781,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 0.43641991230332733,
+      "grad_norm": 0.7828994989395142,
+      "kl": 0.0457763671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0044,
+      "num_tokens": 48330979.0,
+      "reward": 1.109375,
+      "reward_std": 0.09332224726676941,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1093749925494194,
+      "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663,
+      "step": 423
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 881.0,
+      "completions/max_terminated_length": 881.0,
+      "completions/mean_length": 377.1071472167969,
+      "completions/mean_terminated_length": 377.1071472167969,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "epoch": 0.4374516378643281,
+      "grad_norm": 0.6373588442802429,
+      "kl": 0.0523681640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0019,
+      "num_tokens": 48437398.0,
+      "reward": 1.1406251192092896,
+      "reward_std": 0.09292246401309967,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.140625,
+      "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658,
+      "step": 424
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 754.0,
+      "completions/max_terminated_length": 754.0,
+      "completions/mean_length": 375.3214416503906,
+      "completions/mean_terminated_length": 375.3214416503906,
+      "completions/min_length": 166.0,
+      "completions/min_terminated_length": 166.0,
+      "epoch": 0.43848336342532884,
+      "grad_norm": 0.7247622013092041,
+      "kl": 0.05340576171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0036,
+      "num_tokens": 48539928.0,
+      "reward": 1.134374976158142,
+      "reward_std": 0.10119043290615082,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 425
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 653.0,
+      "completions/max_terminated_length": 653.0,
+      "completions/mean_length": 347.1785888671875,
+      "completions/mean_terminated_length": 347.1785888671875,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 0.4395150889863296,
+      "grad_norm": 0.6451843976974487,
+      "kl": 0.07763671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0009,
+      "num_tokens": 48639782.0,
+      "reward": 1.137946605682373,
+      "reward_std": 0.09939030557870865,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847,
+      "step": 426
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 725.0,
+      "completions/max_terminated_length": 725.0,
+      "completions/mean_length": 407.8571472167969,
+      "completions/mean_terminated_length": 407.8571472167969,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 0.4405468145473304,
+      "grad_norm": 0.558600902557373,
+      "kl": 0.04937744140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0098,
+      "num_tokens": 48755301.0,
+      "reward": 1.0718750953674316,
+      "reward_std": 0.06472522765398026,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07187499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859,
+      "step": 427
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 835.0,
+      "completions/max_terminated_length": 835.0,
+      "completions/mean_length": 366.6875305175781,
+      "completions/mean_terminated_length": 366.6875305175781,
+      "completions/min_length": 143.0,
+      "completions/min_terminated_length": 143.0,
+      "epoch": 0.4415785401083312,
+      "grad_norm": 0.6736543774604797,
+      "kl": 0.0601806640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0076,
+      "num_tokens": 48857872.0,
+      "reward": 1.1625001430511475,
+      "reward_std": 0.08021100610494614,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16250000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084,
+      "step": 428
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 591.0,
+      "completions/max_terminated_length": 591.0,
+      "completions/mean_length": 312.3571472167969,
+      "completions/mean_terminated_length": 312.3571472167969,
+      "completions/min_length": 140.0,
+      "completions/min_terminated_length": 140.0,
+      "epoch": 0.442610265669332,
+      "grad_norm": 0.38957372307777405,
+      "kl": 0.063720703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0027,
+      "num_tokens": 48957844.0,
+      "reward": 1.1968752145767212,
+      "reward_std": 0.027209853753447533,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17440778017044067,
+      "step": 429
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 896.0,
+      "completions/max_terminated_length": 896.0,
+      "completions/mean_length": 382.4196472167969,
+      "completions/mean_terminated_length": 382.4196472167969,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 0.4436419912303327,
+      "grad_norm": 0.6348445415496826,
+      "kl": 0.0562744140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0008,
+      "num_tokens": 49072579.0,
+      "reward": 1.140625,
+      "reward_std": 0.07398059219121933,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.140625,
+      "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658,
+      "step": 430
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 762.0,
+      "completions/max_terminated_length": 762.0,
+      "completions/mean_length": 364.08929443359375,
+      "completions/mean_terminated_length": 364.08929443359375,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 0.4446737167913335,
+      "grad_norm": 0.6346526741981506,
+      "kl": 0.06121826171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0004,
+      "num_tokens": 49181859.0,
+      "reward": 1.0968750715255737,
+      "reward_std": 0.0671311616897583,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09687500447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.15729716420173645,
+      "step": 431
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 762.0,
+      "completions/max_terminated_length": 762.0,
+      "completions/mean_length": 357.46429443359375,
+      "completions/mean_terminated_length": 357.46429443359375,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 0.4457054423523343,
+      "grad_norm": 0.5235685706138611,
+      "kl": 0.0638427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0001,
+      "num_tokens": 49284629.0,
+      "reward": 1.1343750953674316,
+      "reward_std": 0.05059521645307541,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 432
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 834.0,
+      "completions/max_terminated_length": 834.0,
+      "completions/mean_length": 389.2232360839844,
+      "completions/mean_terminated_length": 389.2232360839844,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 0.44673716791333506,
+      "grad_norm": 0.5660263299942017,
+      "kl": 0.06060791015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0177,
+      "num_tokens": 49394883.0,
+      "reward": 1.09375,
+      "reward_std": 0.07157464325428009,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 433
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 656.0,
+      "completions/max_terminated_length": 656.0,
+      "completions/mean_length": 340.8035888671875,
+      "completions/mean_terminated_length": 340.8035888671875,
+      "completions/min_length": 147.0,
+      "completions/min_terminated_length": 147.0,
+      "epoch": 0.44776889347433585,
+      "grad_norm": 0.7160601019859314,
+      "kl": 0.06365966796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0156,
+      "num_tokens": 49507484.0,
+      "reward": 1.15625,
+      "reward_std": 0.08264832943677902,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15625,
+      "rewards/curriculum_aware_reward_fn/std": 0.174774631857872,
+      "step": 434
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 827.0,
+      "completions/max_terminated_length": 827.0,
+      "completions/mean_length": 356.7321472167969,
+      "completions/mean_terminated_length": 356.7321472167969,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 0.4488006190353366,
+      "grad_norm": 0.6561874747276306,
+      "kl": 0.06512451171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0013,
+      "num_tokens": 49609703.0,
+      "reward": 1.1500002145767212,
+      "reward_std": 0.0671311616897583,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064,
+      "step": 435
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 564.0,
+      "completions/max_terminated_length": 564.0,
+      "completions/mean_length": 326.8660888671875,
+      "completions/mean_terminated_length": 326.8660888671875,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 0.44983234459633736,
+      "grad_norm": 0.6523354649543762,
+      "kl": 0.076416015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0173,
+      "num_tokens": 49710334.0,
+      "reward": 1.15625,
+      "reward_std": 0.07780507951974869,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15625,
+      "rewards/curriculum_aware_reward_fn/std": 0.174774631857872,
+      "step": 436
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 787.0,
+      "completions/max_terminated_length": 787.0,
+      "completions/mean_length": 367.8750305175781,
+      "completions/mean_terminated_length": 367.8750305175781,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 0.45086407015733815,
+      "grad_norm": 0.7713767886161804,
+      "kl": 0.06683349609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0195,
+      "num_tokens": 49819765.0,
+      "reward": 1.1035715341567993,
+      "reward_std": 0.1014278456568718,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11250000447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437,
+      "step": 437
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 577.0,
+      "completions/max_terminated_length": 577.0,
+      "completions/mean_length": 349.40179443359375,
+      "completions/mean_terminated_length": 349.40179443359375,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 0.45189579571833893,
+      "grad_norm": 0.7261008024215698,
+      "kl": 0.0672607421875,
+      "learning_rate": 1e-06,
+      "loss": -0.006,
+      "num_tokens": 49922080.0,
+      "reward": 1.1343750953674316,
+      "reward_std": 0.09394123405218124,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 438
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 772.0,
+      "completions/max_terminated_length": 772.0,
+      "completions/mean_length": 378.33929443359375,
+      "completions/mean_terminated_length": 378.33929443359375,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 0.4529275212793397,
+      "grad_norm": 0.5196535587310791,
+      "kl": 0.0665283203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0106,
+      "num_tokens": 50029519.0,
+      "reward": 1.0750000476837158,
+      "reward_std": 0.05543847754597664,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07499999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.144259512424469,
+      "step": 439
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 885.0,
+      "completions/max_terminated_length": 885.0,
+      "completions/mean_length": 379.6785888671875,
+      "completions/mean_terminated_length": 379.6785888671875,
+      "completions/min_length": 150.0,
+      "completions/min_terminated_length": 150.0,
+      "epoch": 0.45395924684034045,
+      "grad_norm": 0.6472377181053162,
+      "kl": 0.0679931640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0151,
+      "num_tokens": 50147587.0,
+      "reward": 1.0875000953674316,
+      "reward_std": 0.06228789687156677,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08749999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.15223558247089386,
+      "step": 440
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 713.0,
+      "completions/max_terminated_length": 713.0,
+      "completions/mean_length": 342.9464416503906,
+      "completions/mean_terminated_length": 342.9464416503906,
+      "completions/min_length": 97.0,
+      "completions/min_terminated_length": 97.0,
+      "epoch": 0.45499097240134123,
+      "grad_norm": 0.5558903217315674,
+      "kl": 0.0648193359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0164,
+      "num_tokens": 50249193.0,
+      "reward": 1.1625001430511475,
+      "reward_std": 0.04957644268870354,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16250000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084,
+      "step": 441
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 664.0,
+      "completions/max_terminated_length": 664.0,
+      "completions/mean_length": 325.4375,
+      "completions/mean_terminated_length": 325.4375,
+      "completions/min_length": 143.0,
+      "completions/min_terminated_length": 143.0,
+      "epoch": 0.456022697962342,
+      "grad_norm": 0.6881309151649475,
+      "kl": 0.0703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0009,
+      "num_tokens": 50343511.0,
+      "reward": 1.1406251192092896,
+      "reward_std": 0.05543847754597664,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1406249850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658,
+      "step": 442
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 885.0,
+      "completions/max_terminated_length": 885.0,
+      "completions/mean_length": 349.6250305175781,
+      "completions/mean_terminated_length": 349.6250305175781,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 0.4570544235233428,
+      "grad_norm": 0.6663326025009155,
+      "kl": 0.0712890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0045,
+      "num_tokens": 50444444.0,
+      "reward": 1.1750001907348633,
+      "reward_std": 0.07678630203008652,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.17499998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.17578651010990143,
+      "step": 443
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 575.0,
+      "completions/max_terminated_length": 575.0,
+      "completions/mean_length": 349.6607360839844,
+      "completions/mean_terminated_length": 349.6607360839844,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 0.4580861490843436,
+      "grad_norm": 0.5423421263694763,
+      "kl": 0.07086181640625,
+      "learning_rate": 1e-06,
+      "loss": 0.007,
+      "num_tokens": 50546214.0,
+      "reward": 1.109375,
+      "reward_std": 0.07092426717281342,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.109375,
+      "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663,
+      "step": 444
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 844.0,
+      "completions/max_terminated_length": 844.0,
+      "completions/mean_length": 373.6160888671875,
+      "completions/mean_terminated_length": 373.6160888671875,
+      "completions/min_length": 127.0,
+      "completions/min_terminated_length": 127.0,
+      "epoch": 0.4591178746453443,
+      "grad_norm": 0.7760041356086731,
+      "kl": 0.0716552734375,
+      "learning_rate": 1e-06,
+      "loss": -0.008,
+      "num_tokens": 50657149.0,
+      "reward": 1.1500000953674316,
+      "reward_std": 0.08986614644527435,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064,
+      "step": 445
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 838.0,
+      "completions/max_terminated_length": 838.0,
+      "completions/mean_length": 373.5535888671875,
+      "completions/mean_terminated_length": 373.5535888671875,
+      "completions/min_length": 121.0,
+      "completions/min_terminated_length": 121.0,
+      "epoch": 0.4601496002063451,
+      "grad_norm": 0.6723666191101074,
+      "kl": 0.06146240234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0132,
+      "num_tokens": 50770681.0,
+      "reward": 1.0718750953674316,
+      "reward_std": 0.059231583029031754,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07187499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859,
+      "step": 446
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 665.0,
+      "completions/max_terminated_length": 665.0,
+      "completions/mean_length": 393.21429443359375,
+      "completions/mean_terminated_length": 393.21429443359375,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 0.4611813257673459,
+      "grad_norm": 0.5650410056114197,
+      "kl": 0.06182861328125,
+      "learning_rate": 1e-06,
+      "loss": 0.002,
+      "num_tokens": 50888795.0,
+      "reward": 1.071874976158142,
+      "reward_std": 0.06506221741437912,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07187499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859,
+      "step": 447
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 863.0,
+      "completions/max_terminated_length": 863.0,
+      "completions/mean_length": 367.8571472167969,
+      "completions/mean_terminated_length": 367.8571472167969,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 0.46221305132834667,
+      "grad_norm": 0.616245687007904,
+      "kl": 0.05908203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0053,
+      "num_tokens": 50997048.0,
+      "reward": 1.1593750715255737,
+      "reward_std": 0.06953710317611694,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15937499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.1750844269990921,
+      "step": 448
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 736.0,
+      "completions/max_terminated_length": 736.0,
+      "completions/mean_length": 417.02679443359375,
+      "completions/mean_terminated_length": 417.02679443359375,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 0.46324477688934745,
+      "grad_norm": 0.5801323652267456,
+      "kl": 0.0625,
+      "learning_rate": 1e-06,
+      "loss": -0.0008,
+      "num_tokens": 51112964.0,
+      "reward": 1.1187500953674316,
+      "reward_std": 0.05543848127126694,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645818948745728,
+      "step": 449
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 605.0,
+      "completions/max_terminated_length": 605.0,
+      "completions/mean_length": 407.3660888671875,
+      "completions/mean_terminated_length": 407.3660888671875,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 0.4642765024503482,
+      "grad_norm": 0.7780681252479553,
+      "kl": 0.05841064453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0238,
+      "num_tokens": 51234644.0,
+      "reward": 1.0750001668930054,
+      "reward_std": 0.08610443770885468,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07500000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.144259512424469,
+      "step": 450
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1089.0,
+      "completions/max_terminated_length": 1089.0,
+      "completions/mean_length": 415.6250305175781,
+      "completions/mean_terminated_length": 415.6250305175781,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "epoch": 0.46530822801134897,
+      "grad_norm": 0.6879647374153137,
+      "kl": 0.06134033203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0209,
+      "num_tokens": 51350991.0,
+      "reward": 1.1281250715255737,
+      "reward_std": 0.08505426347255707,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16936305165290833,
+      "step": 451
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 669.0,
+      "completions/max_terminated_length": 669.0,
+      "completions/mean_length": 367.08038330078125,
+      "completions/mean_terminated_length": 367.08038330078125,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "epoch": 0.46633995357234975,
+      "grad_norm": 0.5170190930366516,
+      "kl": 0.0662841796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0199,
+      "num_tokens": 51462230.0,
+      "reward": 1.0937501192092896,
+      "reward_std": 0.056425854563713074,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569157898426056,
+      "step": 452
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 983.0,
+      "completions/max_terminated_length": 983.0,
+      "completions/mean_length": 342.7500305175781,
+      "completions/mean_terminated_length": 342.7500305175781,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 0.46737167913335054,
+      "grad_norm": 0.7710051536560059,
+      "kl": 0.06256103515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0126,
+      "num_tokens": 51559460.0,
+      "reward": 1.146875023841858,
+      "reward_std": 0.07919223606586456,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14687499403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847,
+      "step": 453
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 822.0,
+      "completions/max_terminated_length": 822.0,
+      "completions/mean_length": 422.65179443359375,
+      "completions/mean_terminated_length": 422.65179443359375,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 0.4684034046943513,
+      "grad_norm": 0.6395953893661499,
+      "kl": 0.0594482421875,
+      "learning_rate": 1e-06,
+      "loss": -0.01,
+      "num_tokens": 51669083.0,
+      "reward": 1.0593750476837158,
+      "reward_std": 0.06025034934282303,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05937499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553,
+      "step": 454
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 791.0,
+      "completions/max_terminated_length": 791.0,
+      "completions/mean_length": 402.9196472167969,
+      "completions/mean_terminated_length": 402.9196472167969,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "epoch": 0.46943513025535205,
+      "grad_norm": 0.5968127846717834,
+      "kl": 0.05462646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0182,
+      "num_tokens": 51778162.0,
+      "reward": 1.1468751430511475,
+      "reward_std": 0.054419707506895065,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847,
+      "step": 455
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 787.0,
+      "completions/max_terminated_length": 787.0,
+      "completions/mean_length": 367.3125305175781,
+      "completions/mean_terminated_length": 367.3125305175781,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 0.47046685581635284,
+      "grad_norm": 0.7138005495071411,
+      "kl": 0.05780029296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0184,
+      "num_tokens": 51886410.0,
+      "reward": 1.1625001430511475,
+      "reward_std": 0.10945840179920197,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16250000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084,
+      "step": 456
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1215.0,
+      "completions/max_terminated_length": 1215.0,
+      "completions/mean_length": 452.2500305175781,
+      "completions/mean_terminated_length": 452.2500305175781,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 0.4714985813773536,
+      "grad_norm": 0.7615479230880737,
+      "kl": 0.05108642578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0039,
+      "num_tokens": 52010506.0,
+      "reward": 1.1968752145767212,
+      "reward_std": 0.10396476089954376,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.21416938304901123,
+      "step": 457
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 678.0,
+      "completions/max_terminated_length": 678.0,
+      "completions/mean_length": 391.77679443359375,
+      "completions/mean_terminated_length": 391.77679443359375,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 0.4725303069383544,
+      "grad_norm": 0.7827943563461304,
+      "kl": 0.064208984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0023,
+      "num_tokens": 52121627.0,
+      "reward": 1.1375001668930054,
+      "reward_std": 0.10603370517492294,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17170317471027374,
+      "step": 458
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 758.0,
+      "completions/max_terminated_length": 758.0,
+      "completions/mean_length": 406.1696472167969,
+      "completions/mean_terminated_length": 406.1696472167969,
+      "completions/min_length": 153.0,
+      "completions/min_terminated_length": 153.0,
+      "epoch": 0.4735620324993552,
+      "grad_norm": 0.5137292146682739,
+      "kl": 0.05517578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0131,
+      "num_tokens": 52226366.0,
+      "reward": 1.1468751430511475,
+      "reward_std": 0.04855767637491226,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14687499403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847,
+      "step": 459
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1125.0,
+      "completions/max_terminated_length": 1125.0,
+      "completions/mean_length": 472.5000305175781,
+      "completions/mean_terminated_length": 472.5000305175781,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "epoch": 0.4745937580603559,
+      "grad_norm": 0.6859979033470154,
+      "kl": 0.0482177734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0098,
+      "num_tokens": 52340809.0,
+      "reward": 1.140625,
+      "reward_std": 0.10119043290615082,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.140625,
+      "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658,
+      "step": 460
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2388.0,
+      "completions/max_terminated_length": 2388.0,
+      "completions/mean_length": 459.4107360839844,
+      "completions/mean_terminated_length": 459.4107360839844,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 0.4756254836213567,
+      "grad_norm": 0.5346077680587769,
+      "kl": 0.0545654296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0123,
+      "num_tokens": 52458455.0,
+      "reward": 1.0656250715255737,
+      "reward_std": 0.058863185346126556,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06562499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.13722331821918488,
+      "step": 461
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 917.0,
+      "completions/max_terminated_length": 917.0,
+      "completions/mean_length": 416.5089416503906,
+      "completions/mean_terminated_length": 416.5089416503906,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 0.4766572091823575,
+      "grad_norm": 0.448335736989975,
+      "kl": 0.05621337890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0101,
+      "num_tokens": 52565689.0,
+      "reward": 1.1593750715255737,
+      "reward_std": 0.04130847379565239,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15937499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.1750844269990921,
+      "step": 462
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 930.0,
+      "completions/max_terminated_length": 930.0,
+      "completions/mean_length": 417.5625305175781,
+      "completions/mean_terminated_length": 417.5625305175781,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "epoch": 0.4776889347433583,
+      "grad_norm": 0.5065658688545227,
+      "kl": 0.064697265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0151,
+      "num_tokens": 52683478.0,
+      "reward": 1.125,
+      "reward_std": 0.043346013873815536,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446,
+      "step": 463
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 731.0,
+      "completions/max_terminated_length": 731.0,
+      "completions/mean_length": 429.7589416503906,
+      "completions/mean_terminated_length": 429.7589416503906,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 0.47872066030435906,
+      "grad_norm": 0.5872586369514465,
+      "kl": 0.0594482421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0093,
+      "num_tokens": 52799266.0,
+      "reward": 1.1437500715255737,
+      "reward_std": 0.06469383090734482,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.17296110093593597,
+      "step": 464
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 724.0,
+      "completions/max_terminated_length": 724.0,
+      "completions/mean_length": 399.9464416503906,
+      "completions/mean_terminated_length": 399.9464416503906,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 0.4797523858653598,
+      "grad_norm": 0.6111489534378052,
+      "kl": 0.05621337890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0028,
+      "num_tokens": 52907620.0,
+      "reward": 1.1250001192092896,
+      "reward_std": 0.05784441530704498,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446,
+      "step": 465
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 863.0,
+      "completions/max_terminated_length": 863.0,
+      "completions/mean_length": 483.46429443359375,
+      "completions/mean_terminated_length": 483.46429443359375,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "epoch": 0.48078411142636057,
+      "grad_norm": 0.6069656014442444,
+      "kl": 0.0506591796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0106,
+      "num_tokens": 53039930.0,
+      "reward": 1.0843751430511475,
+      "reward_std": 0.0812297835946083,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08437500894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536,
+      "step": 466
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 806.0,
+      "completions/max_terminated_length": 806.0,
+      "completions/mean_length": 432.33038330078125,
+      "completions/mean_terminated_length": 432.33038330078125,
+      "completions/min_length": 126.0,
+      "completions/min_terminated_length": 126.0,
+      "epoch": 0.48181583698736136,
+      "grad_norm": 0.6573135256767273,
+      "kl": 0.053955078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0021,
+      "num_tokens": 53156692.0,
+      "reward": 1.1281251907348633,
+      "reward_std": 0.0612691231071949,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713,
+      "step": 467
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1135.0,
+      "completions/max_terminated_length": 1135.0,
+      "completions/mean_length": 406.70538330078125,
+      "completions/mean_terminated_length": 406.70538330078125,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 0.48284756254836214,
+      "grad_norm": 0.6067453026771545,
+      "kl": 0.05816650390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0178,
+      "num_tokens": 53271130.0,
+      "reward": 1.1781251430511475,
+      "reward_std": 0.08326731622219086,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.17812499403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.175758495926857,
+      "step": 468
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1820.0,
+      "completions/max_terminated_length": 1820.0,
+      "completions/mean_length": 475.1964416503906,
+      "completions/mean_terminated_length": 475.1964416503906,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 0.4838792881093629,
+      "grad_norm": 0.5233703851699829,
+      "kl": 0.0491943359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0044,
+      "num_tokens": 53396431.0,
+      "reward": 1.078125,
+      "reward_std": 0.06228789687156677,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.078125,
+      "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437,
+      "step": 469
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 768.0,
+      "completions/max_terminated_length": 768.0,
+      "completions/mean_length": 426.8571472167969,
+      "completions/mean_terminated_length": 426.8571472167969,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 0.4849110136703637,
+      "grad_norm": 0.6300012469291687,
+      "kl": 0.06439208984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0101,
+      "num_tokens": 53516612.0,
+      "reward": 1.0875000953674316,
+      "reward_std": 0.07678630203008652,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.15223558247089386,
+      "step": 470
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 867.0,
+      "completions/max_terminated_length": 867.0,
+      "completions/mean_length": 380.64288330078125,
+      "completions/mean_terminated_length": 380.64288330078125,
+      "completions/min_length": 143.0,
+      "completions/min_terminated_length": 143.0,
+      "epoch": 0.48594273923136444,
+      "grad_norm": 0.6337840557098389,
+      "kl": 0.0531005859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0115,
+      "num_tokens": 53625506.0,
+      "reward": 1.1500000953674316,
+      "reward_std": 0.06611239165067673,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064,
+      "step": 471
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 828.0,
+      "completions/max_terminated_length": 828.0,
+      "completions/mean_length": 379.6250305175781,
+      "completions/mean_terminated_length": 379.6250305175781,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "epoch": 0.4869744647923652,
+      "grad_norm": 0.6945529580116272,
+      "kl": 0.0655517578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0072,
+      "num_tokens": 53728532.0,
+      "reward": 1.1437500715255737,
+      "reward_std": 0.07536773383617401,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.17296108603477478,
+      "step": 472
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 820.0,
+      "completions/max_terminated_length": 820.0,
+      "completions/mean_length": 451.33929443359375,
+      "completions/mean_terminated_length": 451.33929443359375,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 0.488006190353366,
+      "grad_norm": 0.5490451455116272,
+      "kl": 0.05792236328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0041,
+      "num_tokens": 53845365.0,
+      "reward": 1.0812500715255737,
+      "reward_std": 0.06025035306811333,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08124999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.14843402802944183,
+      "step": 473
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 743.0,
+      "completions/max_terminated_length": 743.0,
+      "completions/mean_length": 391.4107360839844,
+      "completions/mean_terminated_length": 391.4107360839844,
+      "completions/min_length": 153.0,
+      "completions/min_terminated_length": 153.0,
+      "epoch": 0.4890379159143668,
+      "grad_norm": 0.6644481420516968,
+      "kl": 0.06201171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0019,
+      "num_tokens": 53950705.0,
+      "reward": 1.1437500715255737,
+      "reward_std": 0.08224855363368988,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14374999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.17296110093593597,
+      "step": 474
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 784.0,
+      "completions/max_terminated_length": 784.0,
+      "completions/mean_length": 431.27679443359375,
+      "completions/mean_terminated_length": 431.27679443359375,
+      "completions/min_length": 166.0,
+      "completions/min_terminated_length": 166.0,
+      "epoch": 0.4900696414753676,
+      "grad_norm": 0.6886666417121887,
+      "kl": 0.057861328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0109,
+      "num_tokens": 54072858.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.08607304841279984,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562500894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.20146213471889496,
+      "step": 475
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1161.0,
+      "completions/max_terminated_length": 1161.0,
+      "completions/mean_length": 463.544677734375,
+      "completions/mean_terminated_length": 463.544677734375,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 0.4911013670363683,
+      "grad_norm": 0.7024446129798889,
+      "kl": 0.05206298828125,
+      "learning_rate": 1e-06,
+      "loss": 0.013,
+      "num_tokens": 54195594.0,
+      "reward": 1.1781251430511475,
+      "reward_std": 0.10461514443159103,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.17812500894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.2348836213350296,
+      "step": 476
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 830.0,
+      "completions/max_terminated_length": 830.0,
+      "completions/mean_length": 426.8035888671875,
+      "completions/mean_terminated_length": 426.8035888671875,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 0.4921330925973691,
+      "grad_norm": 0.6875271797180176,
+      "kl": 0.05548095703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0022,
+      "num_tokens": 54305878.0,
+      "reward": 1.1281250715255737,
+      "reward_std": 0.09051652997732162,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16936305165290833,
+      "step": 477
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1037.0,
+      "completions/max_terminated_length": 1037.0,
+      "completions/mean_length": 460.4107360839844,
+      "completions/mean_terminated_length": 460.4107360839844,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 0.4931648181583699,
+      "grad_norm": 0.5122353434562683,
+      "kl": 0.05712890625,
+      "learning_rate": 1e-06,
+      "loss": 0.017,
+      "num_tokens": 54423580.0,
+      "reward": 1.1062501668930054,
+      "reward_std": 0.05682564154267311,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274,
+      "step": 478
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 834.0,
+      "completions/max_terminated_length": 834.0,
+      "completions/mean_length": 435.1339416503906,
+      "completions/mean_terminated_length": 435.1339416503906,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 0.49419654371937066,
+      "grad_norm": 0.695402204990387,
+      "kl": 0.0577392578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0107,
+      "num_tokens": 54546018.0,
+      "reward": 1.1406251192092896,
+      "reward_std": 0.08949775993824005,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.140625,
+      "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658,
+      "step": 479
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 624.0,
+      "completions/max_terminated_length": 624.0,
+      "completions/mean_length": 379.65179443359375,
+      "completions/mean_terminated_length": 379.65179443359375,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "epoch": 0.49522826928037145,
+      "grad_norm": 0.8098606467247009,
+      "kl": 0.05194091796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0014,
+      "num_tokens": 54648809.0,
+      "reward": 1.1687501668930054,
+      "reward_std": 0.09292247146368027,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16875000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17567436397075653,
+      "step": 480
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 735.0,
+      "completions/max_terminated_length": 735.0,
+      "completions/mean_length": 416.26788330078125,
+      "completions/mean_terminated_length": 416.26788330078125,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 0.4962599948413722,
+      "grad_norm": 0.5609670281410217,
+      "kl": 0.06036376953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0146,
+      "num_tokens": 54762638.0,
+      "reward": 1.1343750953674316,
+      "reward_std": 0.06367506831884384,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 481
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 799.0,
+      "completions/max_terminated_length": 799.0,
+      "completions/mean_length": 433.0000305175781,
+      "completions/mean_terminated_length": 433.0000305175781,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 0.49729172040237296,
+      "grad_norm": 0.6345481872558594,
+      "kl": 0.06072998046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0155,
+      "num_tokens": 54876269.0,
+      "reward": 1.1468751430511475,
+      "reward_std": 0.08709181845188141,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14687499403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847,
+      "step": 482
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1076.0,
+      "completions/max_terminated_length": 1076.0,
+      "completions/mean_length": 427.27679443359375,
+      "completions/mean_terminated_length": 427.27679443359375,
+      "completions/min_length": 158.0,
+      "completions/min_terminated_length": 158.0,
+      "epoch": 0.49832344596337375,
+      "grad_norm": 0.6227778792381287,
+      "kl": 0.05731201171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0056,
+      "num_tokens": 54992227.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.05886319279670715,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562500894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892,
+      "step": 483
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 848.0,
+      "completions/max_terminated_length": 848.0,
+      "completions/mean_length": 466.4732360839844,
+      "completions/mean_terminated_length": 466.4732360839844,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "epoch": 0.49935517152437453,
+      "grad_norm": 0.521126389503479,
+      "kl": 0.05126953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0035,
+      "num_tokens": 55111175.0,
+      "reward": 1.1312501430511475,
+      "reward_std": 0.05784441903233528,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13124999403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.17020456492900848,
+      "step": 484
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 939.0,
+      "completions/max_terminated_length": 939.0,
+      "completions/mean_length": 449.732177734375,
+      "completions/mean_terminated_length": 449.732177734375,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 0.5003868970853753,
+      "grad_norm": 0.5438706278800964,
+      "kl": 0.0584716796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0123,
+      "num_tokens": 55226554.0,
+      "reward": 1.1187500953674316,
+      "reward_std": 0.07197443395853043,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17296108603477478,
+      "step": 485
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1259.0,
+      "completions/max_terminated_length": 1259.0,
+      "completions/mean_length": 448.08929443359375,
+      "completions/mean_terminated_length": 448.08929443359375,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "epoch": 0.501418622646376,
+      "grad_norm": 0.5385457873344421,
+      "kl": 0.0576171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0052,
+      "num_tokens": 55343184.0,
+      "reward": 1.1437500715255737,
+      "reward_std": 0.07919223606586456,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.17296110093593597,
+      "step": 486
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 957.0,
+      "completions/max_terminated_length": 957.0,
+      "completions/mean_length": 412.6875305175781,
+      "completions/mean_terminated_length": 412.6875305175781,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "epoch": 0.5024503482073769,
+      "grad_norm": 0.5771428346633911,
+      "kl": 0.05352783203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0092,
+      "num_tokens": 55456661.0,
+      "reward": 1.1687501668930054,
+      "reward_std": 0.08021100610494614,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16875000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17567436397075653,
+      "step": 487
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1353.0,
+      "completions/max_terminated_length": 1353.0,
+      "completions/mean_length": 430.0357360839844,
+      "completions/mean_terminated_length": 430.0357360839844,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 0.5034820737683776,
+      "grad_norm": 0.7232245802879333,
+      "kl": 0.06512451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0022,
+      "num_tokens": 55572011.0,
+      "reward": 1.140625,
+      "reward_std": 0.0981341153383255,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.140625,
+      "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658,
+      "step": 488
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1030.0,
+      "completions/max_terminated_length": 1030.0,
+      "completions/mean_length": 426.83929443359375,
+      "completions/mean_terminated_length": 426.83929443359375,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 0.5045137993293783,
+      "grad_norm": 0.5774347186088562,
+      "kl": 0.0574951171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0023,
+      "num_tokens": 55688291.0,
+      "reward": 1.1437500715255737,
+      "reward_std": 0.06130051985383034,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.17296108603477478,
+      "step": 489
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 870.0,
+      "completions/max_terminated_length": 870.0,
+      "completions/mean_length": 421.4107360839844,
+      "completions/mean_terminated_length": 421.4107360839844,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 0.5055455248903792,
+      "grad_norm": 0.6256279945373535,
+      "kl": 0.0693359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0064,
+      "num_tokens": 55800477.0,
+      "reward": 1.1000001430511475,
+      "reward_std": 0.08706042170524597,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09999998658895493,
+      "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345,
+      "step": 490
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 957.0,
+      "completions/max_terminated_length": 957.0,
+      "completions/mean_length": 500.4107360839844,
+      "completions/mean_terminated_length": 500.4107360839844,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "epoch": 0.5065772504513799,
+      "grad_norm": 0.5111896991729736,
+      "kl": 0.052734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0063,
+      "num_tokens": 55928987.0,
+      "reward": 1.0562500953674316,
+      "reward_std": 0.05886319279670715,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05624999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.12912124395370483,
+      "step": 491
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 913.0,
+      "completions/max_terminated_length": 913.0,
+      "completions/mean_length": 506.52679443359375,
+      "completions/mean_terminated_length": 506.52679443359375,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "epoch": 0.5076089760123808,
+      "grad_norm": 0.534400224685669,
+      "kl": 0.05596923828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0036,
+      "num_tokens": 56061479.0,
+      "reward": 1.0968750715255737,
+      "reward_std": 0.06367506086826324,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09687499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.15729716420173645,
+      "step": 492
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 941.0,
+      "completions/max_terminated_length": 941.0,
+      "completions/mean_length": 470.15179443359375,
+      "completions/mean_terminated_length": 470.15179443359375,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 0.5086407015733815,
+      "grad_norm": 0.5752992630004883,
+      "kl": 0.06439208984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0004,
+      "num_tokens": 56186124.0,
+      "reward": 1.0562500953674316,
+      "reward_std": 0.054419707506895065,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05624999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.12912124395370483,
+      "step": 493
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1067.0,
+      "completions/max_terminated_length": 1067.0,
+      "completions/mean_length": 459.1250305175781,
+      "completions/mean_terminated_length": 459.1250305175781,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 0.5096724271343822,
+      "grad_norm": 0.6120277047157288,
+      "kl": 0.06231689453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0087,
+      "num_tokens": 56308988.0,
+      "reward": 1.0660713911056519,
+      "reward_std": 0.0911223366856575,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07499999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.144259512424469,
+      "step": 494
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1065.0,
+      "completions/max_terminated_length": 1065.0,
+      "completions/mean_length": 416.8214416503906,
+      "completions/mean_terminated_length": 416.8214416503906,
+      "completions/min_length": 158.0,
+      "completions/min_terminated_length": 158.0,
+      "epoch": 0.510704152695383,
+      "grad_norm": 0.5477854013442993,
+      "kl": 0.0775146484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0134,
+      "num_tokens": 56408158.0,
+      "reward": 1.1937501430511475,
+      "reward_std": 0.06367506086826324,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19375000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.174774631857872,
+      "step": 495
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 726.0,
+      "completions/max_terminated_length": 726.0,
+      "completions/mean_length": 436.8214416503906,
+      "completions/mean_terminated_length": 436.8214416503906,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 0.5117358782563838,
+      "grad_norm": 0.7140277028083801,
+      "kl": 0.08984375,
+      "learning_rate": 1e-06,
+      "loss": 0.001,
+      "num_tokens": 56517233.0,
+      "reward": 1.0750000476837158,
+      "reward_std": 0.09394123405218124,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07499999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.144259512424469,
+      "step": 496
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 904.0,
+      "completions/max_terminated_length": 904.0,
+      "completions/mean_length": 415.1785888671875,
+      "completions/mean_terminated_length": 415.1785888671875,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 0.5127676038173846,
+      "grad_norm": 0.6335656642913818,
+      "kl": 0.0782470703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0066,
+      "num_tokens": 56623184.0,
+      "reward": 1.1500000953674316,
+      "reward_std": 0.0674995556473732,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064,
+      "step": 497
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1242.0,
+      "completions/max_terminated_length": 1242.0,
+      "completions/mean_length": 464.1160888671875,
+      "completions/mean_terminated_length": 464.1160888671875,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 0.5137993293783853,
+      "grad_norm": 0.6236574053764343,
+      "kl": 0.0736083984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0119,
+      "num_tokens": 56738354.0,
+      "reward": 1.1406251192092896,
+      "reward_std": 0.0778050646185875,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1406249850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658,
+      "step": 498
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 798.0,
+      "completions/max_terminated_length": 798.0,
+      "completions/mean_length": 443.01788330078125,
+      "completions/mean_terminated_length": 443.01788330078125,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 0.5148310549393861,
+      "grad_norm": 0.6208124756813049,
+      "kl": 0.0714111328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0041,
+      "num_tokens": 56847584.0,
+      "reward": 1.131250023841858,
+      "reward_std": 0.08505427092313766,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13125000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.17020456492900848,
+      "step": 499
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1099.0,
+      "completions/max_terminated_length": 1099.0,
+      "completions/mean_length": 581.5089721679688,
+      "completions/mean_terminated_length": 581.5089721679688,
+      "completions/min_length": 377.0,
+      "completions/min_terminated_length": 377.0,
+      "epoch": 0.5158627805003869,
+      "grad_norm": 0.4899929463863373,
+      "kl": 0.05731201171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0102,
+      "num_tokens": 56992947.0,
+      "reward": 1.0562500953674316,
+      "reward_std": 0.05303255096077919,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05624999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.12912124395370483,
+      "step": 500
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 773.0,
+      "completions/max_terminated_length": 773.0,
+      "completions/mean_length": 463.8482360839844,
+      "completions/mean_terminated_length": 463.8482360839844,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 0.5168945060613876,
+      "grad_norm": 0.41739174723625183,
+      "kl": 0.0767822265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0087,
+      "num_tokens": 57113820.0,
+      "reward": 1.0937501192092896,
+      "reward_std": 0.04130847007036209,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569157898426056,
+      "step": 501
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 681.0,
+      "completions/max_terminated_length": 681.0,
+      "completions/mean_length": 426.9910888671875,
+      "completions/mean_terminated_length": 426.9910888671875,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 0.5179262316223885,
+      "grad_norm": 0.6298090815544128,
+      "kl": 0.079345703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0036,
+      "num_tokens": 57222592.0,
+      "reward": 1.234375,
+      "reward_std": 0.07882384210824966,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.234375,
+      "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892,
+      "step": 502
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1367.0,
+      "completions/max_terminated_length": 1367.0,
+      "completions/mean_length": 508.7410888671875,
+      "completions/mean_terminated_length": 508.7410888671875,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 0.5189579571833892,
+      "grad_norm": 0.5767647624015808,
+      "kl": 0.071044921875,
+      "learning_rate": 1e-06,
+      "loss": -0.007,
+      "num_tokens": 57343628.0,
+      "reward": 1.1312501430511475,
+      "reward_std": 0.06953709572553635,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13124999403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.17020456492900848,
+      "step": 503
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 720.0,
+      "completions/max_terminated_length": 720.0,
+      "completions/mean_length": 419.95538330078125,
+      "completions/mean_terminated_length": 419.95538330078125,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "epoch": 0.51998968274439,
+      "grad_norm": 0.5730735063552856,
+      "kl": 0.080322265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0026,
+      "num_tokens": 57456148.0,
+      "reward": 1.1125000715255737,
+      "reward_std": 0.06851832568645477,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11249999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437,
+      "step": 504
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 959.0,
+      "completions/max_terminated_length": 959.0,
+      "completions/mean_length": 482.3482360839844,
+      "completions/mean_terminated_length": 482.3482360839844,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 0.5210214083053908,
+      "grad_norm": 0.7494083046913147,
+      "kl": 0.082763671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0174,
+      "num_tokens": 57584952.0,
+      "reward": 1.1098215579986572,
+      "reward_std": 0.11654523760080338,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645818948745728,
+      "step": 505
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1121.0,
+      "completions/max_terminated_length": 1121.0,
+      "completions/mean_length": 560.1875,
+      "completions/mean_terminated_length": 560.1875,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 0.5220531338663915,
+      "grad_norm": 0.5996860861778259,
+      "kl": 0.064697265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0054,
+      "num_tokens": 57712674.0,
+      "reward": 1.0843751430511475,
+      "reward_std": 0.0919036939740181,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08437500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.17732131481170654,
+      "step": 506
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1043.0,
+      "completions/max_terminated_length": 1043.0,
+      "completions/mean_length": 502.3035888671875,
+      "completions/mean_terminated_length": 502.3035888671875,
+      "completions/min_length": 263.0,
+      "completions/min_terminated_length": 263.0,
+      "epoch": 0.5230848594273924,
+      "grad_norm": 0.49378275871276855,
+      "kl": 0.070068359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0077,
+      "num_tokens": 57835166.0,
+      "reward": 1.0937501192092896,
+      "reward_std": 0.05886319279670715,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569157898426056,
+      "step": 507
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 840.0,
+      "completions/max_terminated_length": 840.0,
+      "completions/mean_length": 483.26788330078125,
+      "completions/mean_terminated_length": 483.26788330078125,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 0.5241165849883931,
+      "grad_norm": 0.6792031526565552,
+      "kl": 0.06884765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0124,
+      "num_tokens": 57956702.0,
+      "reward": 1.1843751668930054,
+      "reward_std": 0.09915288537740707,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.18437500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17553409934043884,
+      "step": 508
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1205.0,
+      "completions/max_terminated_length": 1205.0,
+      "completions/mean_length": 552.7142944335938,
+      "completions/mean_terminated_length": 552.7142944335938,
+      "completions/min_length": 257.0,
+      "completions/min_terminated_length": 257.0,
+      "epoch": 0.5251483105493938,
+      "grad_norm": 0.3717767596244812,
+      "kl": 0.068359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0033,
+      "num_tokens": 58085169.0,
+      "reward": 1.053125023841858,
+      "reward_std": 0.03507804498076439,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05312499776482582,
+      "rewards/curriculum_aware_reward_fn/std": 0.1261489987373352,
+      "step": 509
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1499.0,
+      "completions/max_terminated_length": 1499.0,
+      "completions/mean_length": 556.4107666015625,
+      "completions/mean_terminated_length": 556.4107666015625,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 0.5261800361103947,
+      "grad_norm": 0.47721174359321594,
+      "kl": 0.0660400390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0161,
+      "num_tokens": 58219279.0,
+      "reward": 1.0625,
+      "reward_std": 0.0612691231071949,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0625,
+      "rewards/curriculum_aware_reward_fn/std": 0.13465002179145813,
+      "step": 510
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 920.0,
+      "completions/max_terminated_length": 920.0,
+      "completions/mean_length": 493.4107360839844,
+      "completions/mean_terminated_length": 493.4107360839844,
+      "completions/min_length": 302.0,
+      "completions/min_terminated_length": 302.0,
+      "epoch": 0.5272117616713954,
+      "grad_norm": 0.6741013526916504,
+      "kl": 0.0714111328125,
+      "learning_rate": 1e-06,
+      "loss": 0.007,
+      "num_tokens": 58338990.0,
+      "reward": 1.1218751668930054,
+      "reward_std": 0.08709181845188141,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 511
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 926.0,
+      "completions/max_terminated_length": 926.0,
+      "completions/mean_length": 514.4910888671875,
+      "completions/mean_terminated_length": 514.4910888671875,
+      "completions/min_length": 277.0,
+      "completions/min_terminated_length": 277.0,
+      "epoch": 0.5282434872323962,
+      "grad_norm": 0.621870219707489,
+      "kl": 0.0732421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0083,
+      "num_tokens": 58460460.0,
+      "reward": 1.1160715818405151,
+      "reward_std": 0.09299345314502716,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1249999925494194,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446,
+      "step": 512
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 873.0,
+      "completions/max_terminated_length": 873.0,
+      "completions/mean_length": 568.6607666015625,
+      "completions/mean_terminated_length": 568.6607666015625,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 0.529275212793397,
+      "grad_norm": 0.6457920074462891,
+      "kl": 0.068115234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0252,
+      "num_tokens": 58596495.0,
+      "reward": 1.1254465579986572,
+      "reward_std": 0.12541785836219788,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 513
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1916.0,
+      "completions/max_terminated_length": 1916.0,
+      "completions/mean_length": 594.7589721679688,
+      "completions/mean_terminated_length": 594.7589721679688,
+      "completions/min_length": 354.0,
+      "completions/min_terminated_length": 354.0,
+      "epoch": 0.5303069383543977,
+      "grad_norm": 0.6898475885391235,
+      "kl": 0.065673828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0125,
+      "num_tokens": 58735922.0,
+      "reward": 1.1218750476837158,
+      "reward_std": 0.10742086172103882,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 514
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 921.0,
+      "completions/max_terminated_length": 921.0,
+      "completions/mean_length": 537.3214721679688,
+      "completions/mean_terminated_length": 537.3214721679688,
+      "completions/min_length": 297.0,
+      "completions/min_terminated_length": 297.0,
+      "epoch": 0.5313386639153985,
+      "grad_norm": 0.5636951327323914,
+      "kl": 0.072998046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0077,
+      "num_tokens": 58874929.0,
+      "reward": 1.1281250715255737,
+      "reward_std": 0.08261694014072418,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713,
+      "step": 515
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1224.0,
+      "completions/max_terminated_length": 1224.0,
+      "completions/mean_length": 539.0178833007812,
+      "completions/mean_terminated_length": 539.0178833007812,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 0.5323703894763993,
+      "grad_norm": 0.45992597937583923,
+      "kl": 0.0703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0209,
+      "num_tokens": 59009282.0,
+      "reward": 1.0906251668930054,
+      "reward_std": 0.0612691268324852,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364,
+      "step": 516
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1305.0,
+      "completions/max_terminated_length": 1305.0,
+      "completions/mean_length": 499.58929443359375,
+      "completions/mean_terminated_length": 499.58929443359375,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 0.5334021150374001,
+      "grad_norm": 0.3661348223686218,
+      "kl": 0.07275390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0118,
+      "num_tokens": 59131631.0,
+      "reward": 1.1723216772079468,
+      "reward_std": 0.0601193830370903,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.18124999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.21520207822322845,
+      "step": 517
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1206.0,
+      "completions/max_terminated_length": 1206.0,
+      "completions/mean_length": 483.08038330078125,
+      "completions/mean_terminated_length": 483.08038330078125,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 0.5344338405984008,
+      "grad_norm": 0.7646259069442749,
+      "kl": 0.07177734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0005,
+      "num_tokens": 59252355.0,
+      "reward": 1.1218751668930054,
+      "reward_std": 0.11427027732133865,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 518
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 944.0,
+      "completions/max_terminated_length": 944.0,
+      "completions/mean_length": 510.5625305175781,
+      "completions/mean_terminated_length": 510.5625305175781,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 0.5354655661594016,
+      "grad_norm": 0.6553826928138733,
+      "kl": 0.06451416015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0014,
+      "num_tokens": 59373826.0,
+      "reward": 1.1375000476837158,
+      "reward_std": 0.10803984105587006,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17170315980911255,
+      "step": 519
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 878.0,
+      "completions/max_terminated_length": 878.0,
+      "completions/mean_length": 511.0714416503906,
+      "completions/mean_terminated_length": 511.0714416503906,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 0.5364972917204024,
+      "grad_norm": 0.5374631285667419,
+      "kl": 0.0660400390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0016,
+      "num_tokens": 59500408.0,
+      "reward": 1.0906251668930054,
+      "reward_std": 0.07780507206916809,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364,
+      "step": 520
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 974.0,
+      "completions/max_terminated_length": 974.0,
+      "completions/mean_length": 514.732177734375,
+      "completions/mean_terminated_length": 514.732177734375,
+      "completions/min_length": 293.0,
+      "completions/min_terminated_length": 293.0,
+      "epoch": 0.5375290172814031,
+      "grad_norm": 0.5410560965538025,
+      "kl": 0.066162109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0133,
+      "num_tokens": 59633840.0,
+      "reward": 1.0723215341567993,
+      "reward_std": 0.080716073513031,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08125000447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064,
+      "step": 521
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 820.0,
+      "completions/max_terminated_length": 820.0,
+      "completions/mean_length": 483.14288330078125,
+      "completions/mean_terminated_length": 483.14288330078125,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "epoch": 0.538560742842404,
+      "grad_norm": 0.4773089289665222,
+      "kl": 0.0733642578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0031,
+      "num_tokens": 59756818.0,
+      "reward": 1.1187500953674316,
+      "reward_std": 0.04957644268870354,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645818948745728,
+      "step": 522
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1191.0,
+      "completions/max_terminated_length": 1191.0,
+      "completions/mean_length": 482.8035888671875,
+      "completions/mean_terminated_length": 482.8035888671875,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 0.5395924684034047,
+      "grad_norm": 0.3887348473072052,
+      "kl": 0.06591796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0082,
+      "num_tokens": 59875929.0,
+      "reward": 1.0812500715255737,
+      "reward_std": 0.03405927121639252,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08125000447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064,
+      "step": 523
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 810.0,
+      "completions/max_terminated_length": 810.0,
+      "completions/mean_length": 472.857177734375,
+      "completions/mean_terminated_length": 472.857177734375,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 0.5406241939644054,
+      "grad_norm": 0.7101907730102539,
+      "kl": 0.064453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0268,
+      "num_tokens": 59997209.0,
+      "reward": 1.1723215579986572,
+      "reward_std": 0.11900390684604645,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.18124999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17567436397075653,
+      "step": 524
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 893.0,
+      "completions/max_terminated_length": 893.0,
+      "completions/mean_length": 454.4375305175781,
+      "completions/mean_terminated_length": 454.4375305175781,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 0.5416559195254063,
+      "grad_norm": 0.6443462371826172,
+      "kl": 0.07470703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0051,
+      "num_tokens": 60117756.0,
+      "reward": 1.0968750715255737,
+      "reward_std": 0.08021100610494614,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09687500447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.15729716420173645,
+      "step": 525
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1014.0,
+      "completions/mean_length": 524.875,
+      "completions/mean_terminated_length": 492.7027282714844,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 0.542687645086407,
+      "grad_norm": 0.5063578486442566,
+      "kl": 0.06024169921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0342,
+      "num_tokens": 60247921.0,
+      "reward": 1.0660715103149414,
+      "reward_std": 0.09110311418771744,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07500000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.144259512424469,
+      "step": 526
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 849.0,
+      "completions/max_terminated_length": 849.0,
+      "completions/mean_length": 468.0089416503906,
+      "completions/mean_terminated_length": 468.0089416503906,
+      "completions/min_length": 154.0,
+      "completions/min_terminated_length": 154.0,
+      "epoch": 0.5437193706474078,
+      "grad_norm": 0.6476441621780396,
+      "kl": 0.0732421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0087,
+      "num_tokens": 60367265.0,
+      "reward": 1.1062501668930054,
+      "reward_std": 0.10220920294523239,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274,
+      "step": 527
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 956.0,
+      "completions/max_terminated_length": 956.0,
+      "completions/mean_length": 482.4464416503906,
+      "completions/mean_terminated_length": 482.4464416503906,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 0.5447510962084086,
+      "grad_norm": 0.6608678698539734,
+      "kl": 0.06353759765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0134,
+      "num_tokens": 60495870.0,
+      "reward": 1.1343750953674316,
+      "reward_std": 0.09193507581949234,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 528
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1126.0,
+      "completions/max_terminated_length": 1126.0,
+      "completions/mean_length": 552.7678833007812,
+      "completions/mean_terminated_length": 552.7678833007812,
+      "completions/min_length": 257.0,
+      "completions/min_terminated_length": 257.0,
+      "epoch": 0.5457828217694093,
+      "grad_norm": 0.650008499622345,
+      "kl": 0.06597900390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0118,
+      "num_tokens": 60635389.0,
+      "reward": 1.078125,
+      "reward_std": 0.0812297835946083,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.078125,
+      "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437,
+      "step": 529
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 718.0,
+      "completions/max_terminated_length": 718.0,
+      "completions/mean_length": 441.58038330078125,
+      "completions/mean_terminated_length": 441.58038330078125,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 0.5468145473304101,
+      "grad_norm": 0.43306875228881836,
+      "kl": 0.0657958984375,
+      "learning_rate": 1e-06,
+      "loss": -0.01,
+      "num_tokens": 60754881.0,
+      "reward": 1.0593750476837158,
+      "reward_std": 0.038902536034584045,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05937499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553,
+      "step": 530
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 855.0,
+      "completions/max_terminated_length": 855.0,
+      "completions/mean_length": 441.5357360839844,
+      "completions/mean_terminated_length": 441.5357360839844,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.5478462728914109,
+      "grad_norm": 0.7140151858329773,
+      "kl": 0.069091796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0095,
+      "num_tokens": 60871015.0,
+      "reward": 1.1375000476837158,
+      "reward_std": 0.09572818130254745,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17170317471027374,
+      "step": 531
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1150.0,
+      "completions/max_terminated_length": 1150.0,
+      "completions/mean_length": 481.357177734375,
+      "completions/mean_terminated_length": 481.357177734375,
+      "completions/min_length": 274.0,
+      "completions/min_terminated_length": 274.0,
+      "epoch": 0.5488779984524117,
+      "grad_norm": 0.6157094836235046,
+      "kl": 0.064453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0037,
+      "num_tokens": 60989068.0,
+      "reward": 1.0843751430511475,
+      "reward_std": 0.10017165541648865,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08437498658895493,
+      "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536,
+      "step": 532
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1037.0,
+      "completions/max_terminated_length": 1037.0,
+      "completions/mean_length": 531.7678833007812,
+      "completions/mean_terminated_length": 531.7678833007812,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "epoch": 0.5499097240134124,
+      "grad_norm": 0.35044318437576294,
+      "kl": 0.05377197265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0062,
+      "num_tokens": 61114783.0,
+      "reward": 1.0843751430511475,
+      "reward_std": 0.03202172368764877,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08437498658895493,
+      "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536,
+      "step": 533
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 952.0,
+      "completions/max_terminated_length": 952.0,
+      "completions/mean_length": 464.21429443359375,
+      "completions/mean_terminated_length": 464.21429443359375,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "epoch": 0.5509414495744132,
+      "grad_norm": 0.5607831478118896,
+      "kl": 0.062744140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0195,
+      "num_tokens": 61241955.0,
+      "reward": 1.1406251192092896,
+      "reward_std": 0.06956849992275238,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1406249850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658,
+      "step": 534
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 953.0,
+      "completions/max_terminated_length": 953.0,
+      "completions/mean_length": 494.96429443359375,
+      "completions/mean_terminated_length": 494.96429443359375,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "epoch": 0.551973175135414,
+      "grad_norm": 0.6276484727859497,
+      "kl": 0.0616455078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0102,
+      "num_tokens": 61356149.0,
+      "reward": 1.0723214149475098,
+      "reward_std": 0.09315988421440125,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08125000447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064,
+      "step": 535
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 897.0,
+      "completions/max_terminated_length": 897.0,
+      "completions/mean_length": 509.419677734375,
+      "completions/mean_terminated_length": 509.419677734375,
+      "completions/min_length": 271.0,
+      "completions/min_terminated_length": 271.0,
+      "epoch": 0.5530049006964147,
+      "grad_norm": 0.5943368673324585,
+      "kl": 0.05865478515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0084,
+      "num_tokens": 61483617.0,
+      "reward": 1.09375,
+      "reward_std": 0.0674995481967926,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 536
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1164.0,
+      "completions/max_terminated_length": 1164.0,
+      "completions/mean_length": 556.4464721679688,
+      "completions/mean_terminated_length": 556.4464721679688,
+      "completions/min_length": 290.0,
+      "completions/min_terminated_length": 290.0,
+      "epoch": 0.5540366262574156,
+      "grad_norm": 0.45444533228874207,
+      "kl": 0.05535888671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0025,
+      "num_tokens": 61618039.0,
+      "reward": 1.100000023841858,
+      "reward_std": 0.060250356793403625,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09999998658895493,
+      "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345,
+      "step": 537
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 810.0,
+      "completions/max_terminated_length": 810.0,
+      "completions/mean_length": 488.044677734375,
+      "completions/mean_terminated_length": 488.044677734375,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 0.5550683518184163,
+      "grad_norm": 0.49808990955352783,
+      "kl": 0.06365966796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0049,
+      "num_tokens": 61745598.0,
+      "reward": 1.0812500715255737,
+      "reward_std": 0.0660809874534607,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08124999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064,
+      "step": 538
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1487.0,
+      "completions/max_terminated_length": 1487.0,
+      "completions/mean_length": 534.7232666015625,
+      "completions/mean_terminated_length": 534.7232666015625,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 0.556100077379417,
+      "grad_norm": 0.369132399559021,
+      "kl": 0.06011962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0058,
+      "num_tokens": 61879071.0,
+      "reward": 1.0750000476837158,
+      "reward_std": 0.03547782823443413,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07499999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.144259512424469,
+      "step": 539
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 890.0,
+      "completions/max_terminated_length": 890.0,
+      "completions/mean_length": 482.83929443359375,
+      "completions/mean_terminated_length": 482.83929443359375,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 0.5571318029404179,
+      "grad_norm": 0.39808061718940735,
+      "kl": 0.0628662109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0006,
+      "num_tokens": 61998235.0,
+      "reward": 1.15625,
+      "reward_std": 0.04028969630599022,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15625,
+      "rewards/curriculum_aware_reward_fn/std": 0.174774631857872,
+      "step": 540
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 791.0,
+      "completions/max_terminated_length": 791.0,
+      "completions/mean_length": 445.3035888671875,
+      "completions/mean_terminated_length": 445.3035888671875,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 0.5581635285014186,
+      "grad_norm": 0.7094811201095581,
+      "kl": 0.0657958984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0376,
+      "num_tokens": 62112565.0,
+      "reward": 1.1593750715255737,
+      "reward_std": 0.1025775894522667,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15937499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.1750844269990921,
+      "step": 541
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 861.0,
+      "completions/max_terminated_length": 861.0,
+      "completions/mean_length": 454.7232360839844,
+      "completions/mean_terminated_length": 454.7232360839844,
+      "completions/min_length": 277.0,
+      "completions/min_terminated_length": 277.0,
+      "epoch": 0.5591952540624194,
+      "grad_norm": 0.6402541995048523,
+      "kl": 0.0628662109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0126,
+      "num_tokens": 62222651.0,
+      "reward": 1.1375000476837158,
+      "reward_std": 0.09088490903377533,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17170318961143494,
+      "step": 542
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 951.0,
+      "completions/max_terminated_length": 951.0,
+      "completions/mean_length": 481.2500305175781,
+      "completions/mean_terminated_length": 481.2500305175781,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "epoch": 0.5602269796234202,
+      "grad_norm": 0.7908324003219604,
+      "kl": 0.06268310546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0249,
+      "num_tokens": 62341007.0,
+      "reward": 1.1218751668930054,
+      "reward_std": 0.12392540276050568,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 543
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 773.0,
+      "completions/max_terminated_length": 773.0,
+      "completions/mean_length": 472.919677734375,
+      "completions/mean_terminated_length": 472.919677734375,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 0.5612587051844209,
+      "grad_norm": 0.493557333946228,
+      "kl": 0.064208984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0026,
+      "num_tokens": 62467740.0,
+      "reward": 1.1125001907348633,
+      "reward_std": 0.04615173488855362,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11249999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437,
+      "step": 544
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 931.0,
+      "completions/max_terminated_length": 931.0,
+      "completions/mean_length": 475.96429443359375,
+      "completions/mean_terminated_length": 475.96429443359375,
+      "completions/min_length": 267.0,
+      "completions/min_terminated_length": 267.0,
+      "epoch": 0.5622904307454217,
+      "grad_norm": 0.5978304743766785,
+      "kl": 0.05908203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0064,
+      "num_tokens": 62586631.0,
+      "reward": 1.1000001430511475,
+      "reward_std": 0.07641790807247162,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.15882451832294464,
+      "step": 545
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1043.0,
+      "completions/max_terminated_length": 1043.0,
+      "completions/mean_length": 503.8839416503906,
+      "completions/mean_terminated_length": 503.8839416503906,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "epoch": 0.5633221563064225,
+      "grad_norm": 0.38978615403175354,
+      "kl": 0.06207275390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0098,
+      "num_tokens": 62713493.0,
+      "reward": 1.1468751430511475,
+      "reward_std": 0.04130847379565239,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14687499403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847,
+      "step": 546
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 856.0,
+      "completions/max_terminated_length": 856.0,
+      "completions/mean_length": 475.6160888671875,
+      "completions/mean_terminated_length": 475.6160888671875,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 0.5643538818674233,
+      "grad_norm": 0.5205122828483582,
+      "kl": 0.06402587890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0149,
+      "num_tokens": 62834480.0,
+      "reward": 1.1218751668930054,
+      "reward_std": 0.05784441903233528,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 547
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 859.0,
+      "completions/max_terminated_length": 859.0,
+      "completions/mean_length": 437.3839416503906,
+      "completions/mean_terminated_length": 437.3839416503906,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 0.565385607428424,
+      "grad_norm": 0.7416389584541321,
+      "kl": 0.0648193359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0056,
+      "num_tokens": 62945249.0,
+      "reward": 1.1687501668930054,
+      "reward_std": 0.12115109711885452,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16875000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17567436397075653,
+      "step": 548
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 918.0,
+      "completions/max_terminated_length": 918.0,
+      "completions/mean_length": 468.8750305175781,
+      "completions/mean_terminated_length": 468.8750305175781,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "epoch": 0.5664173329894248,
+      "grad_norm": 0.7131133079528809,
+      "kl": 0.06671142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.004,
+      "num_tokens": 63073230.0,
+      "reward": 1.1129463911056519,
+      "reward_std": 0.13358858227729797,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.17395521700382233,
+      "step": 549
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 964.0,
+      "completions/mean_length": 520.1785888671875,
+      "completions/mean_terminated_length": 487.9639892578125,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 0.5674490585504256,
+      "grad_norm": 0.5437191128730774,
+      "kl": 0.06109619140625,
+      "learning_rate": 1e-06,
+      "loss": 0.034,
+      "num_tokens": 63195057.0,
+      "reward": 1.1348214149475098,
+      "reward_std": 0.09237737953662872,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14374999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.17296110093593597,
+      "step": 550
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1045.0,
+      "completions/max_terminated_length": 1045.0,
+      "completions/mean_length": 460.2500305175781,
+      "completions/mean_terminated_length": 460.2500305175781,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 0.5684807841114263,
+      "grad_norm": 0.5379366278648376,
+      "kl": 0.0653076171875,
+      "learning_rate": 1e-06,
+      "loss": 0.014,
+      "num_tokens": 63309402.0,
+      "reward": 1.1281250715255737,
+      "reward_std": 0.0671311616897583,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12812498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.16936305165290833,
+      "step": 551
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 950.0,
+      "completions/max_terminated_length": 950.0,
+      "completions/mean_length": 493.6964416503906,
+      "completions/mean_terminated_length": 493.6964416503906,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 0.5695125096724272,
+      "grad_norm": 0.5773839354515076,
+      "kl": 0.06378173828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0048,
+      "num_tokens": 63438798.0,
+      "reward": 1.0593751668930054,
+      "reward_std": 0.06268768012523651,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05937499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553,
+      "step": 552
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 896.0,
+      "completions/max_terminated_length": 896.0,
+      "completions/mean_length": 489.7232360839844,
+      "completions/mean_terminated_length": 489.7232360839844,
+      "completions/min_length": 231.0,
+      "completions/min_terminated_length": 231.0,
+      "epoch": 0.5705442352334279,
+      "grad_norm": 0.5632832646369934,
+      "kl": 0.05279541015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0153,
+      "num_tokens": 63558145.0,
+      "reward": 1.134374976158142,
+      "reward_std": 0.0674995481967926,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 553
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1010.0,
+      "completions/max_terminated_length": 1010.0,
+      "completions/mean_length": 471.5625305175781,
+      "completions/mean_terminated_length": 471.5625305175781,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 0.5715759607944286,
+      "grad_norm": 0.5308297872543335,
+      "kl": 0.0601806640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0149,
+      "num_tokens": 63677096.0,
+      "reward": 1.1062501668930054,
+      "reward_std": 0.07055586576461792,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274,
+      "step": 554
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 713.0,
+      "completions/max_terminated_length": 713.0,
+      "completions/mean_length": 427.70538330078125,
+      "completions/mean_terminated_length": 427.70538330078125,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "epoch": 0.5726076863554295,
+      "grad_norm": 0.6717932224273682,
+      "kl": 0.0667724609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0149,
+      "num_tokens": 63787894.0,
+      "reward": 1.1500000953674316,
+      "reward_std": 0.08709181845188141,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064,
+      "step": 555
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 740.0,
+      "completions/max_terminated_length": 740.0,
+      "completions/mean_length": 407.8035888671875,
+      "completions/mean_terminated_length": 407.8035888671875,
+      "completions/min_length": 148.0,
+      "completions/min_terminated_length": 148.0,
+      "epoch": 0.5736394119164302,
+      "grad_norm": 0.6472453474998474,
+      "kl": 0.068115234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0161,
+      "num_tokens": 63890800.0,
+      "reward": 1.1375000476837158,
+      "reward_std": 0.07438036054372787,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17170317471027374,
+      "step": 556
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 783.0,
+      "completions/max_terminated_length": 783.0,
+      "completions/mean_length": 431.83038330078125,
+      "completions/mean_terminated_length": 431.83038330078125,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.574671137477431,
+      "grad_norm": 0.7740367650985718,
+      "kl": 0.064453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0077,
+      "num_tokens": 64010733.0,
+      "reward": 1.21875,
+      "reward_std": 0.09434102475643158,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21875,
+      "rewards/curriculum_aware_reward_fn/std": 0.17020456492900848,
+      "step": 557
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 728.0,
+      "completions/max_terminated_length": 728.0,
+      "completions/mean_length": 425.90179443359375,
+      "completions/mean_terminated_length": 425.90179443359375,
+      "completions/min_length": 257.0,
+      "completions/min_terminated_length": 257.0,
+      "epoch": 0.5757028630384318,
+      "grad_norm": 0.6837030053138733,
+      "kl": 0.08349609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0121,
+      "num_tokens": 64118634.0,
+      "reward": 1.1156251430511475,
+      "reward_std": 0.07197443395853043,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562498658895493,
+      "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892,
+      "step": 558
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 910.0,
+      "completions/max_terminated_length": 910.0,
+      "completions/mean_length": 476.1607360839844,
+      "completions/mean_terminated_length": 476.1607360839844,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 0.5767345885994325,
+      "grad_norm": 0.5691381692886353,
+      "kl": 0.05963134765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0034,
+      "num_tokens": 64243518.0,
+      "reward": 1.0843751430511475,
+      "reward_std": 0.0671311616897583,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08437500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536,
+      "step": 559
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 930.0,
+      "completions/max_terminated_length": 930.0,
+      "completions/mean_length": 508.7857360839844,
+      "completions/mean_terminated_length": 508.7857360839844,
+      "completions/min_length": 279.0,
+      "completions/min_terminated_length": 279.0,
+      "epoch": 0.5777663141604333,
+      "grad_norm": 0.5485077500343323,
+      "kl": 0.05517578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0099,
+      "num_tokens": 64372532.0,
+      "reward": 1.1066964864730835,
+      "reward_std": 0.10006203502416611,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.1653594672679901,
+      "step": 560
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 805.0,
+      "completions/max_terminated_length": 805.0,
+      "completions/mean_length": 455.6964416503906,
+      "completions/mean_terminated_length": 455.6964416503906,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 0.5787980397214341,
+      "grad_norm": 0.6604443192481995,
+      "kl": 0.0601806640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0042,
+      "num_tokens": 64496601.0,
+      "reward": 1.1343750953674316,
+      "reward_std": 0.06509362161159515,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 561
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 845.0,
+      "completions/max_terminated_length": 845.0,
+      "completions/mean_length": 431.6339416503906,
+      "completions/mean_terminated_length": 431.6339416503906,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "epoch": 0.5798297652824349,
+      "grad_norm": 0.613683819770813,
+      "kl": 0.09552001953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0284,
+      "num_tokens": 64608560.0,
+      "reward": 1.1843751668930054,
+      "reward_std": 0.08021100610494614,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.18437500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.17553408443927765,
+      "step": 562
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 642.0,
+      "completions/max_terminated_length": 642.0,
+      "completions/mean_length": 409.02679443359375,
+      "completions/mean_terminated_length": 409.02679443359375,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 0.5808614908434356,
+      "grad_norm": 0.6886749863624573,
+      "kl": 0.0592041015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0062,
+      "num_tokens": 64719429.0,
+      "reward": 1.1187500953674316,
+      "reward_std": 0.10359636694192886,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.16645818948745728,
+      "step": 563
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 940.0,
+      "completions/max_terminated_length": 940.0,
+      "completions/mean_length": 417.1339416503906,
+      "completions/mean_terminated_length": 417.1339416503906,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 0.5818932164044365,
+      "grad_norm": 0.6410400867462158,
+      "kl": 0.0565185546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0076,
+      "num_tokens": 64835161.0,
+      "reward": 1.1968752145767212,
+      "reward_std": 0.08746021240949631,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17440778017044067,
+      "step": 564
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 923.0,
+      "completions/max_terminated_length": 923.0,
+      "completions/mean_length": 451.357177734375,
+      "completions/mean_terminated_length": 451.357177734375,
+      "completions/min_length": 257.0,
+      "completions/min_terminated_length": 257.0,
+      "epoch": 0.5829249419654372,
+      "grad_norm": 0.6501334309577942,
+      "kl": 0.06207275390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0037,
+      "num_tokens": 64963273.0,
+      "reward": 1.1250001192092896,
+      "reward_std": 0.08122977614402771,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446,
+      "step": 565
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 974.0,
+      "completions/max_terminated_length": 974.0,
+      "completions/mean_length": 456.83038330078125,
+      "completions/mean_terminated_length": 456.83038330078125,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 0.5839566675264379,
+      "grad_norm": 0.34422266483306885,
+      "kl": 0.05535888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0023,
+      "num_tokens": 65084305.0,
+      "reward": 1.1000001430511475,
+      "reward_std": 0.035477831959724426,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612,
+      "rewards/curriculum_aware_reward_fn/std": 0.15882451832294464,
+      "step": 566
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 764.0,
+      "completions/max_terminated_length": 764.0,
+      "completions/mean_length": 453.9285888671875,
+      "completions/mean_terminated_length": 453.9285888671875,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 0.5849883930874388,
+      "grad_norm": 0.4713008999824524,
+      "kl": 0.06524658203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0203,
+      "num_tokens": 65204053.0,
+      "reward": 1.125,
+      "reward_std": 0.05096360668540001,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.1985306292772293,
+      "step": 567
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1047.0,
+      "completions/max_terminated_length": 1047.0,
+      "completions/mean_length": 482.1964416503906,
+      "completions/mean_terminated_length": 482.1964416503906,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 0.5860201186484395,
+      "grad_norm": 0.6445662975311279,
+      "kl": 0.0772705078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0229,
+      "num_tokens": 65331482.0,
+      "reward": 1.09375,
+      "reward_std": 0.08021100610494614,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 568
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 914.0,
+      "completions/max_terminated_length": 914.0,
+      "completions/mean_length": 421.3839416503906,
+      "completions/mean_terminated_length": 421.3839416503906,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 0.5870518442094403,
+      "grad_norm": 0.6060656905174255,
+      "kl": 0.05853271484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0091,
+      "num_tokens": 65449356.0,
+      "reward": 1.1062501668930054,
+      "reward_std": 0.0671311616897583,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274,
+      "step": 569
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1181.0,
+      "completions/max_terminated_length": 1181.0,
+      "completions/mean_length": 414.0982360839844,
+      "completions/mean_terminated_length": 414.0982360839844,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 0.5880835697704411,
+      "grad_norm": 0.7277713418006897,
+      "kl": 0.06231689453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0031,
+      "num_tokens": 65561199.0,
+      "reward": 1.1500000953674316,
+      "reward_std": 0.07438036799430847,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064,
+      "step": 570
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 745.0,
+      "completions/max_terminated_length": 745.0,
+      "completions/mean_length": 425.7946472167969,
+      "completions/mean_terminated_length": 425.7946472167969,
+      "completions/min_length": 133.0,
+      "completions/min_terminated_length": 133.0,
+      "epoch": 0.5891152953314418,
+      "grad_norm": 0.6139292120933533,
+      "kl": 0.05096435546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0131,
+      "num_tokens": 65677621.0,
+      "reward": 1.1343750953674316,
+      "reward_std": 0.07095566391944885,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437500596046448,
+      "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134,
+      "step": 571
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 807.0,
+      "completions/max_terminated_length": 807.0,
+      "completions/mean_length": 420.26788330078125,
+      "completions/mean_terminated_length": 420.26788330078125,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 0.5901470208924426,
+      "grad_norm": 0.40907788276672363,
+      "kl": 0.0604248046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0001,
+      "num_tokens": 65801278.0,
+      "reward": 1.09375,
+      "reward_std": 0.03507804498076439,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937,
+      "step": 572
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 905.0,
+      "completions/max_terminated_length": 905.0,
+      "completions/mean_length": 442.794677734375,
+      "completions/mean_terminated_length": 442.794677734375,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 0.5911787464534434,
+      "grad_norm": 0.7001194357872009,
+      "kl": 0.059814453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0214,
+      "num_tokens": 65922446.0,
+      "reward": 1.0906250476837158,
+      "reward_std": 0.10119043290615082,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364,
+      "step": 573
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 787.0,
+      "completions/max_terminated_length": 787.0,
+      "completions/mean_length": 375.0982360839844,
+      "completions/mean_terminated_length": 375.0982360839844,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 0.5922104720144442,
+      "grad_norm": 0.9055859446525574,
+      "kl": 0.06549072265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0242,
+      "num_tokens": 66025770.0,
+      "reward": 1.1218751668930054,
+      "reward_std": 0.12392540276050568,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808,
+      "step": 574
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 778.0,
+      "completions/max_terminated_length": 778.0,
+      "completions/mean_length": 357.65179443359375,
+      "completions/mean_terminated_length": 357.65179443359375,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 0.5932421975754449,
+      "grad_norm": 0.8322078585624695,
+      "kl": 0.06396484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0189,
+      "num_tokens": 66127049.0,
+      "reward": 1.1500000953674316,
+      "reward_std": 0.11288311332464218,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064,
+      "step": 575
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 549.0,
+      "completions/max_terminated_length": 549.0,
+      "completions/mean_length": 346.6964416503906,
+      "completions/mean_terminated_length": 346.6964416503906,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 0.5942739231364457,
+      "grad_norm": 0.8472862839698792,
+      "kl": 0.06304931640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0057,
+      "num_tokens": 66230832.0,
+      "reward": 1.125,
+      "reward_std": 0.10603369772434235,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446,
+      "step": 576
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 849.0,
+      "completions/max_terminated_length": 849.0,
+      "completions/mean_length": 445.46429443359375,
+      "completions/mean_terminated_length": 445.46429443359375,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 0.5953056486974465,
+      "grad_norm": 0.5997181534767151,
+      "kl": 0.0504150390625,
+      "learning_rate": 1e-06,
+      "loss": 0.001,
+      "num_tokens": 66349310.0,
+      "reward": 1.078125238418579,
+      "reward_std": 0.08021100610494614,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.078125,
+      "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437,
+      "step": 577
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 852.0,
+      "completions/max_terminated_length": 852.0,
+      "completions/mean_length": 381.1875305175781,
+      "completions/mean_terminated_length": 381.1875305175781,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 0.5963373742584472,
+      "grad_norm": 0.5325617790222168,
+      "kl": 0.056396484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0045,
+      "num_tokens": 66459623.0,
+      "reward": 1.1500002145767212,
+      "reward_std": 0.054419711232185364,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064,
+      "step": 578
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 790.0,
+      "completions/max_terminated_length": 790.0,
+      "completions/mean_length": 420.3571472167969,
+      "completions/mean_terminated_length": 420.3571472167969,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 0.5973690998194481,
+      "grad_norm": 0.5707572102546692,
+      "kl": 0.06243896484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0079,
+      "num_tokens": 66577795.0,
+      "reward": 1.1062501668930054,
+      "reward_std": 0.06025034934282303,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274,
+      "step": 579
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1044.0,
+      "completions/max_terminated_length": 1044.0,
+      "completions/mean_length": 419.8839416503906,
+      "completions/mean_terminated_length": 419.8839416503906,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 0.5984008253804488,
+      "grad_norm": 0.6615480780601501,
+      "kl": 0.06341552734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0019,
+      "num_tokens": 66686863.0,
+      "reward": 1.0812500715255737,
+      "reward_std": 0.08021100610494614,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08125000447034836,
+      "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064,
+      "step": 580
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 768.0,
+      "completions/max_terminated_length": 768.0,
+      "completions/mean_length": 359.9375305175781,
+      "completions/mean_terminated_length": 359.9375305175781,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "epoch": 0.5994325509414495,
+      "grad_norm": 0.8319998979568481,
+      "kl": 0.06036376953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0033,
+      "num_tokens": 66783085.0,
+      "reward": 1.2062499523162842,
+      "reward_std": 0.10600230097770691,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.20624998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.17296110093593597,
+      "step": 581
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 994.0,
+      "completions/max_terminated_length": 994.0,
+      "completions/mean_length": 397.4196472167969,
+      "completions/mean_terminated_length": 397.4196472167969,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 0.6004642765024504,
+      "grad_norm": 0.7436203360557556,
+      "kl": 0.0596923828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0172,
+      "num_tokens": 66894089.0,
+      "reward": 1.1656252145767212,
+      "reward_std": 0.09292246401309967,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16562499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.21508759260177612,
+      "step": 582
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 895.0,
+      "completions/max_terminated_length": 895.0,
+      "completions/mean_length": 374.9910888671875,
+      "completions/mean_terminated_length": 374.9910888671875,
+      "completions/min_length": 109.0,
+      "completions/min_terminated_length": 109.0,
+      "epoch": 0.6014960020634511,
+      "grad_norm": 0.6285607218742371,
+      "kl": 0.06005859375,
+      "learning_rate": 1e-06,
+      "loss": -0.01,
+      "num_tokens": 66996897.0,
+      "reward": 1.28125,
+      "reward_std": 0.09340823441743851,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.3145943284034729,
+      "step": 583
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 933.0,
+      "completions/max_terminated_length": 933.0,
+      "completions/mean_length": 382.52679443359375,
+      "completions/mean_terminated_length": 382.52679443359375,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 0.602527727624452,
+      "grad_norm": 0.8746618628501892,
+      "kl": 0.06982421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0165,
+      "num_tokens": 67112211.0,
+      "reward": 1.296875,
+      "reward_std": 0.17254236340522766,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.296875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3052307665348053,
+      "step": 584
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1027.0,
+      "completions/max_terminated_length": 1027.0,
+      "completions/mean_length": 450.8660888671875,
+      "completions/mean_terminated_length": 450.8660888671875,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 0.6035594531854527,
+      "grad_norm": 0.6751203536987305,
+      "kl": 0.0572509765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0107,
+      "num_tokens": 67227780.0,
+      "reward": 1.2379465103149414,
+      "reward_std": 0.14901825785636902,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24687500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.31035298109054565,
+      "step": 585
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 841.0,
+      "completions/max_terminated_length": 841.0,
+      "completions/mean_length": 417.1339416503906,
+      "completions/mean_terminated_length": 417.1339416503906,
+      "completions/min_length": 152.0,
+      "completions/min_terminated_length": 152.0,
+      "epoch": 0.6045911787464534,
+      "grad_norm": 0.7396414279937744,
+      "kl": 0.05865478515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0057,
+      "num_tokens": 67341223.0,
+      "reward": 1.318750023841858,
+      "reward_std": 0.1882440447807312,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31874996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.3501688539981842,
+      "step": 586
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 666.0,
+      "completions/max_terminated_length": 666.0,
+      "completions/mean_length": 390.5000305175781,
+      "completions/mean_terminated_length": 390.5000305175781,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 0.6056229043074542,
+      "grad_norm": 0.8871374130249023,
+      "kl": 0.05767822265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0077,
+      "num_tokens": 67453937.0,
+      "reward": 1.2468751668930054,
+      "reward_std": 0.24108856916427612,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24687500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.31035298109054565,
+      "step": 587
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2066.0,
+      "completions/max_terminated_length": 2066.0,
+      "completions/mean_length": 462.2232360839844,
+      "completions/mean_terminated_length": 462.2232360839844,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 0.606654629868455,
+      "grad_norm": 0.6875455975532532,
+      "kl": 0.0528564453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0508,
+      "num_tokens": 67570736.0,
+      "reward": 1.3125,
+      "reward_std": 0.18902026116847992,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3125,
+      "rewards/curriculum_aware_reward_fn/std": 0.30935123562812805,
+      "step": 588
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 743.0,
+      "completions/max_terminated_length": 743.0,
+      "completions/mean_length": 376.1160888671875,
+      "completions/mean_terminated_length": 376.1160888671875,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 0.6076863554294558,
+      "grad_norm": 0.9995030164718628,
+      "kl": 0.06695556640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0143,
+      "num_tokens": 67690036.0,
+      "reward": 1.3000000715255737,
+      "reward_std": 0.24223896861076355,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29999998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3039529323577881,
+      "step": 589
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 915.0,
+      "completions/max_terminated_length": 915.0,
+      "completions/mean_length": 406.33929443359375,
+      "completions/mean_terminated_length": 406.33929443359375,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "epoch": 0.6087180809904565,
+      "grad_norm": 0.6139847040176392,
+      "kl": 0.0670166015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0312,
+      "num_tokens": 67802067.0,
+      "reward": 1.1656250953674316,
+      "reward_std": 0.09088490903377533,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16562499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.22996585071086884,
+      "step": 590
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 777.0,
+      "completions/max_terminated_length": 777.0,
+      "completions/mean_length": 392.45538330078125,
+      "completions/mean_terminated_length": 392.45538330078125,
+      "completions/min_length": 135.0,
+      "completions/min_terminated_length": 135.0,
+      "epoch": 0.6097498065514573,
+      "grad_norm": 0.7633903622627258,
+      "kl": 0.0614013671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0029,
+      "num_tokens": 67911290.0,
+      "reward": 1.3093751668930054,
+      "reward_std": 0.14392894506454468,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30937498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.3313588798046112,
+      "step": 591
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 934.0,
+      "completions/max_terminated_length": 934.0,
+      "completions/mean_length": 421.5089416503906,
+      "completions/mean_terminated_length": 421.5089416503906,
+      "completions/min_length": 155.0,
+      "completions/min_terminated_length": 155.0,
+      "epoch": 0.6107815321124581,
+      "grad_norm": 0.790761411190033,
+      "kl": 0.05780029296875,
+      "learning_rate": 1e-06,
+      "loss": -0.014,
+      "num_tokens": 68021135.0,
+      "reward": 1.1754463911056519,
+      "reward_std": 0.19190625846385956,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.18437500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.26972052454948425,
+      "step": 592
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 794.0,
+      "completions/max_terminated_length": 794.0,
+      "completions/mean_length": 396.1607360839844,
+      "completions/mean_terminated_length": 396.1607360839844,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 0.6118132576734588,
+      "grad_norm": 0.7488933801651001,
+      "kl": 0.0621337890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0115,
+      "num_tokens": 68133097.0,
+      "reward": 1.2906252145767212,
+      "reward_std": 0.12345291674137115,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.30406635999679565,
+      "step": 593
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1066.0,
+      "completions/max_terminated_length": 1066.0,
+      "completions/mean_length": 434.6160888671875,
+      "completions/mean_terminated_length": 434.6160888671875,
+      "completions/min_length": 137.0,
+      "completions/min_terminated_length": 137.0,
+      "epoch": 0.6128449832344597,
+      "grad_norm": 0.6970859169960022,
+      "kl": 0.0609130859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0015,
+      "num_tokens": 68251529.0,
+      "reward": 1.3156250715255737,
+      "reward_std": 0.10980459302663803,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3114938735961914,
+      "step": 594
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 957.0,
+      "completions/max_terminated_length": 957.0,
+      "completions/mean_length": 446.33038330078125,
+      "completions/mean_terminated_length": 446.33038330078125,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 0.6138767087954604,
+      "grad_norm": 0.7522033452987671,
+      "kl": 0.06707763671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0098,
+      "num_tokens": 68380800.0,
+      "reward": 1.3218750953674316,
+      "reward_std": 0.12741291522979736,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32187503576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.3013319671154022,
+      "step": 595
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 924.0,
+      "completions/max_terminated_length": 924.0,
+      "completions/mean_length": 433.8839416503906,
+      "completions/mean_terminated_length": 433.8839416503906,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 0.6149084343564611,
+      "grad_norm": 0.7475255727767944,
+      "kl": 0.06756591796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0082,
+      "num_tokens": 68501201.0,
+      "reward": 1.1687501668930054,
+      "reward_std": 0.11087696254253387,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16875001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.25724852085113525,
+      "step": 596
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1013.0,
+      "completions/max_terminated_length": 1013.0,
+      "completions/mean_length": 435.8750305175781,
+      "completions/mean_terminated_length": 435.8750305175781,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 0.615940159917462,
+      "grad_norm": 0.6649345755577087,
+      "kl": 0.05682373046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0126,
+      "num_tokens": 68617653.0,
+      "reward": 1.2312501668930054,
+      "reward_std": 0.13754288852214813,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.23125000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.2954707145690918,
+      "step": 597
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1253.0,
+      "completions/max_terminated_length": 1253.0,
+      "completions/mean_length": 492.7857360839844,
+      "completions/mean_terminated_length": 492.7857360839844,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 0.6169718854784627,
+      "grad_norm": 0.6977077722549438,
+      "kl": 0.05615234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0161,
+      "num_tokens": 68741424.0,
+      "reward": 1.1937501430511475,
+      "reward_std": 0.13318131864070892,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19375000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.2522979974746704,
+      "step": 598
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1048.0,
+      "completions/max_terminated_length": 1048.0,
+      "completions/mean_length": 494.107177734375,
+      "completions/mean_terminated_length": 494.107177734375,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 0.6180036110394636,
+      "grad_norm": 0.7319642901420593,
+      "kl": 0.060302734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0247,
+      "num_tokens": 68870222.0,
+      "reward": 1.215625286102295,
+      "reward_std": 0.18056097626686096,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21562500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.2905440032482147,
+      "step": 599
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1302.0,
+      "completions/max_terminated_length": 1302.0,
+      "completions/mean_length": 444.33038330078125,
+      "completions/mean_terminated_length": 444.33038330078125,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 0.6190353366004643,
+      "grad_norm": 0.7302688956260681,
+      "kl": 0.06500244140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0139,
+      "num_tokens": 68996007.0,
+      "reward": 1.2875001430511475,
+      "reward_std": 0.18273977935314178,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2874999940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.3123941123485565,
+      "step": 600
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 802.0,
+      "completions/max_terminated_length": 802.0,
+      "completions/mean_length": 391.4910888671875,
+      "completions/mean_terminated_length": 391.4910888671875,
+      "completions/min_length": 132.0,
+      "completions/min_terminated_length": 132.0,
+      "epoch": 0.620067062161465,
+      "grad_norm": 0.73208087682724,
+      "kl": 0.072021484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0045,
+      "num_tokens": 69102700.0,
+      "reward": 1.3468750715255737,
+      "reward_std": 0.12240273505449295,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.30984458327293396,
+      "step": 601
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1196.0,
+      "completions/max_terminated_length": 1196.0,
+      "completions/mean_length": 432.6607360839844,
+      "completions/mean_terminated_length": 432.6607360839844,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 0.6210987877224659,
+      "grad_norm": 0.6676474809646606,
+      "kl": 0.06787109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0282,
+      "num_tokens": 69218995.0,
+      "reward": 1.2468751668930054,
+      "reward_std": 0.14453645050525665,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24687497317790985,
+      "rewards/curriculum_aware_reward_fn/std": 0.2920324206352234,
+      "step": 602
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1383.0,
+      "completions/max_terminated_length": 1383.0,
+      "completions/mean_length": 495.6785888671875,
+      "completions/mean_terminated_length": 495.6785888671875,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 0.6221305132834666,
+      "grad_norm": 0.6866009831428528,
+      "kl": 0.06170654296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0374,
+      "num_tokens": 69344560.0,
+      "reward": 1.2156251668930054,
+      "reward_std": 0.1428852528333664,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21562497317790985,
+      "rewards/curriculum_aware_reward_fn/std": 0.2789160907268524,
+      "step": 603
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1818.0,
+      "completions/max_terminated_length": 1818.0,
+      "completions/mean_length": 420.9910888671875,
+      "completions/mean_terminated_length": 420.9910888671875,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 0.6231622388444674,
+      "grad_norm": 0.7409844994544983,
+      "kl": 0.06365966796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0014,
+      "num_tokens": 69454067.0,
+      "reward": 1.3593751192092896,
+      "reward_std": 0.1385863721370697,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.359375,
+      "rewards/curriculum_aware_reward_fn/std": 0.3202287554740906,
+      "step": 604
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 801.0,
+      "completions/max_terminated_length": 801.0,
+      "completions/mean_length": 434.39288330078125,
+      "completions/mean_terminated_length": 434.39288330078125,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 0.6241939644054681,
+      "grad_norm": 0.7337531447410583,
+      "kl": 0.06573486328125,
+      "learning_rate": 1e-06,
+      "loss": -0.006,
+      "num_tokens": 69568001.0,
+      "reward": 1.2406251430511475,
+      "reward_std": 0.17304165661334991,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24062500894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.30818644165992737,
+      "step": 605
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 995.0,
+      "completions/max_terminated_length": 995.0,
+      "completions/mean_length": 441.90179443359375,
+      "completions/mean_terminated_length": 441.90179443359375,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 0.6252256899664689,
+      "grad_norm": 0.8032611608505249,
+      "kl": 0.0662841796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0127,
+      "num_tokens": 69683322.0,
+      "reward": 1.28125,
+      "reward_std": 0.16203156113624573,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.29279062151908875,
+      "step": 606
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 923.0,
+      "completions/max_terminated_length": 923.0,
+      "completions/mean_length": 412.4732360839844,
+      "completions/mean_terminated_length": 412.4732360839844,
+      "completions/min_length": 98.0,
+      "completions/min_terminated_length": 98.0,
+      "epoch": 0.6262574155274697,
+      "grad_norm": 0.7155545353889465,
+      "kl": 0.06622314453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0184,
+      "num_tokens": 69786912.0,
+      "reward": 1.3312501907348633,
+      "reward_std": 0.15303204953670502,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33124998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3283267021179199,
+      "step": 607
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 972.0,
+      "completions/max_terminated_length": 972.0,
+      "completions/mean_length": 443.21429443359375,
+      "completions/mean_terminated_length": 443.21429443359375,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 0.6272891410884704,
+      "grad_norm": 0.5675290822982788,
+      "kl": 0.0606689453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0193,
+      "num_tokens": 69904991.0,
+      "reward": 1.2750002145767212,
+      "reward_std": 0.070383720099926,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.31315022706985474,
+      "step": 608
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1062.0,
+      "completions/max_terminated_length": 1062.0,
+      "completions/mean_length": 471.83929443359375,
+      "completions/mean_terminated_length": 471.83929443359375,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 0.6283208666494713,
+      "grad_norm": 0.760869026184082,
+      "kl": 0.06317138671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0122,
+      "num_tokens": 70028656.0,
+      "reward": 1.240625023841858,
+      "reward_std": 0.1671050637960434,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24062499403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.2780669629573822,
+      "step": 609
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 978.0,
+      "completions/max_terminated_length": 978.0,
+      "completions/mean_length": 513.2678833007812,
+      "completions/mean_terminated_length": 513.2678833007812,
+      "completions/min_length": 276.0,
+      "completions/min_terminated_length": 276.0,
+      "epoch": 0.629352592210472,
+      "grad_norm": 0.7933395504951477,
+      "kl": 0.07415771484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0297,
+      "num_tokens": 70160902.0,
+      "reward": 1.2781251668930054,
+      "reward_std": 0.18246570229530334,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.27812498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.304972380399704,
+      "step": 610
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 674.0,
+      "completions/max_terminated_length": 674.0,
+      "completions/mean_length": 366.5000305175781,
+      "completions/mean_terminated_length": 366.5000305175781,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 0.6303843177714727,
+      "grad_norm": 0.8295329213142395,
+      "kl": 0.0723876953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0102,
+      "num_tokens": 70272286.0,
+      "reward": 1.3406251668930054,
+      "reward_std": 0.18557783961296082,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34062501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3025068938732147,
+      "step": 611
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1052.0,
+      "completions/max_terminated_length": 1052.0,
+      "completions/mean_length": 413.7500305175781,
+      "completions/mean_terminated_length": 413.7500305175781,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 0.6314160433324736,
+      "grad_norm": 0.8231275081634521,
+      "kl": 0.0662841796875,
+      "learning_rate": 1e-06,
+      "loss": -0.007,
+      "num_tokens": 70377048.0,
+      "reward": 1.3906251192092896,
+      "reward_std": 0.16967806220054626,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3906249701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.31073373556137085,
+      "step": 612
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 774.0,
+      "completions/max_terminated_length": 774.0,
+      "completions/mean_length": 424.6964416503906,
+      "completions/mean_terminated_length": 424.6964416503906,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 0.6324477688934743,
+      "grad_norm": 0.832097053527832,
+      "kl": 0.0731201171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0028,
+      "num_tokens": 70494306.0,
+      "reward": 1.2875001430511475,
+      "reward_std": 0.1668517291545868,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2874999940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.3123941123485565,
+      "step": 613
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 760.0,
+      "completions/max_terminated_length": 760.0,
+      "completions/mean_length": 480.26788330078125,
+      "completions/mean_terminated_length": 480.26788330078125,
+      "completions/min_length": 283.0,
+      "completions/min_terminated_length": 283.0,
+      "epoch": 0.6334794944544752,
+      "grad_norm": 0.7808151841163635,
+      "kl": 0.0631103515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0098,
+      "num_tokens": 70615478.0,
+      "reward": 1.1937501430511475,
+      "reward_std": 0.15479755401611328,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19374999403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.26922687888145447,
+      "step": 614
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 737.0,
+      "completions/max_terminated_length": 737.0,
+      "completions/mean_length": 449.794677734375,
+      "completions/mean_terminated_length": 449.794677734375,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 0.6345112200154759,
+      "grad_norm": 0.7162142395973206,
+      "kl": 0.0618896484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0203,
+      "num_tokens": 70734455.0,
+      "reward": 1.2437500953674316,
+      "reward_std": 0.13446195423603058,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24374999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.256634920835495,
+      "step": 615
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 918.0,
+      "completions/max_terminated_length": 918.0,
+      "completions/mean_length": 450.0000305175781,
+      "completions/mean_terminated_length": 450.0000305175781,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 0.6355429455764766,
+      "grad_norm": 0.7382791042327881,
+      "kl": 0.06561279296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0011,
+      "num_tokens": 70849122.0,
+      "reward": 1.2875001430511475,
+      "reward_std": 0.18127289414405823,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28749996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.3123941123485565,
+      "step": 616
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 817.0,
+      "completions/max_terminated_length": 817.0,
+      "completions/mean_length": 447.8482360839844,
+      "completions/mean_terminated_length": 447.8482360839844,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 0.6365746711374775,
+      "grad_norm": 0.7692705392837524,
+      "kl": 0.05877685546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0156,
+      "num_tokens": 70966614.0,
+      "reward": 1.1375001668930054,
+      "reward_std": 0.15338782966136932,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.22705517709255219,
+      "step": 617
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 941.0,
+      "completions/max_terminated_length": 941.0,
+      "completions/mean_length": 447.919677734375,
+      "completions/mean_terminated_length": 447.919677734375,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "epoch": 0.6376063966984782,
+      "grad_norm": 0.7012402415275574,
+      "kl": 0.05401611328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0106,
+      "num_tokens": 71083981.0,
+      "reward": 1.3375002145767212,
+      "reward_std": 0.14874815940856934,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.2816057503223419,
+      "step": 618
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1107.0,
+      "completions/max_terminated_length": 1107.0,
+      "completions/mean_length": 466.1250305175781,
+      "completions/mean_terminated_length": 466.1250305175781,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 0.638638122259479,
+      "grad_norm": 0.6417734026908875,
+      "kl": 0.0552978515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0219,
+      "num_tokens": 71207300.0,
+      "reward": 1.2468751668930054,
+      "reward_std": 0.12970300018787384,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24687500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.2994951903820038,
+      "step": 619
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1298.0,
+      "completions/max_terminated_length": 1298.0,
+      "completions/mean_length": 463.857177734375,
+      "completions/mean_terminated_length": 463.857177734375,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 0.6396698478204798,
+      "grad_norm": 0.635471522808075,
+      "kl": 0.05859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0103,
+      "num_tokens": 71336189.0,
+      "reward": 1.2562501430511475,
+      "reward_std": 0.15377187728881836,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2562499940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.3288065195083618,
+      "step": 620
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1085.0,
+      "completions/max_terminated_length": 1085.0,
+      "completions/mean_length": 454.8214416503906,
+      "completions/mean_terminated_length": 454.8214416503906,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "epoch": 0.6407015733814805,
+      "grad_norm": 0.6547529101371765,
+      "kl": 0.0552978515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0119,
+      "num_tokens": 71458634.0,
+      "reward": 1.2000000476837158,
+      "reward_std": 0.14462588727474213,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.20000001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.29231905937194824,
+      "step": 621
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 666.0,
+      "completions/max_terminated_length": 666.0,
+      "completions/mean_length": 370.3214416503906,
+      "completions/mean_terminated_length": 370.3214416503906,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 0.6417332989424813,
+      "grad_norm": 0.7429718375205994,
+      "kl": 0.0694580078125,
+      "learning_rate": 1e-06,
+      "loss": 0.014,
+      "num_tokens": 71569389.0,
+      "reward": 1.2750000953674316,
+      "reward_std": 0.1319045126438141,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3096059560775757,
+      "step": 622
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 811.0,
+      "completions/max_terminated_length": 811.0,
+      "completions/mean_length": 381.9821472167969,
+      "completions/mean_terminated_length": 381.9821472167969,
+      "completions/min_length": 123.0,
+      "completions/min_terminated_length": 123.0,
+      "epoch": 0.642765024503482,
+      "grad_norm": 0.9691736102104187,
+      "kl": 0.0660400390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0141,
+      "num_tokens": 71678482.0,
+      "reward": 1.3718751668930054,
+      "reward_std": 0.18426603078842163,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37187501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3090803921222687,
+      "step": 623
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 776.0,
+      "completions/max_terminated_length": 776.0,
+      "completions/mean_length": 405.5535888671875,
+      "completions/mean_terminated_length": 405.5535888671875,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 0.6437967500644829,
+      "grad_norm": 0.809437096118927,
+      "kl": 0.06268310546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0038,
+      "num_tokens": 71792245.0,
+      "reward": 1.3125,
+      "reward_std": 0.14292655885219574,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3125,
+      "rewards/curriculum_aware_reward_fn/std": 0.33006277680397034,
+      "step": 624
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1030.0,
+      "completions/max_terminated_length": 1030.0,
+      "completions/mean_length": 448.2857360839844,
+      "completions/mean_terminated_length": 448.2857360839844,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 0.6448284756254836,
+      "grad_norm": 0.7424851059913635,
+      "kl": 0.05975341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0028,
+      "num_tokens": 71915498.0,
+      "reward": 1.2687500715255737,
+      "reward_std": 0.1576671600341797,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.26874998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3007591664791107,
+      "step": 625
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1024.0,
+      "completions/mean_length": 393.0357360839844,
+      "completions/mean_terminated_length": 393.0357360839844,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "epoch": 0.6458602011864844,
+      "grad_norm": 0.8500249981880188,
+      "kl": 0.0582275390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0197,
+      "num_tokens": 72026235.0,
+      "reward": 1.3375002145767212,
+      "reward_std": 0.1441763937473297,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3374999463558197,
+      "rewards/curriculum_aware_reward_fn/std": 0.3005625307559967,
+      "step": 626
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 757.0,
+      "completions/max_terminated_length": 757.0,
+      "completions/mean_length": 405.6339416503906,
+      "completions/mean_terminated_length": 405.6339416503906,
+      "completions/min_length": 125.0,
+      "completions/min_terminated_length": 125.0,
+      "epoch": 0.6468919267474852,
+      "grad_norm": 0.662321925163269,
+      "kl": 0.069580078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0308,
+      "num_tokens": 72142373.0,
+      "reward": 1.2937501668930054,
+      "reward_std": 0.13335785269737244,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.31709015369415283,
+      "step": 627
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 740.0,
+      "completions/max_terminated_length": 740.0,
+      "completions/mean_length": 378.1250305175781,
+      "completions/mean_terminated_length": 378.1250305175781,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 0.6479236523084859,
+      "grad_norm": 0.5981107950210571,
+      "kl": 0.06591796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0136,
+      "num_tokens": 72251867.0,
+      "reward": 1.2125000953674316,
+      "reward_std": 0.07582376897335052,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21249999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.3827061057090759,
+      "step": 628
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 967.0,
+      "completions/max_terminated_length": 967.0,
+      "completions/mean_length": 358.5535888671875,
+      "completions/mean_terminated_length": 358.5535888671875,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 0.6489553778694868,
+      "grad_norm": 0.9436933994293213,
+      "kl": 0.06439208984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0086,
+      "num_tokens": 72345008.0,
+      "reward": 1.3625000715255737,
+      "reward_std": 0.1505720466375351,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3005625307559967,
+      "step": 629
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 731.0,
+      "completions/mean_length": 400.4821472167969,
+      "completions/mean_terminated_length": 367.189208984375,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 0.6499871034304875,
+      "grad_norm": 0.5673825740814209,
+      "kl": 0.0594482421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0438,
+      "num_tokens": 72450139.0,
+      "reward": 1.378571629524231,
+      "reward_std": 0.089703768491745,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38749998807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.3267020583152771,
+      "step": 630
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 792.0,
+      "completions/max_terminated_length": 792.0,
+      "completions/mean_length": 416.9464416503906,
+      "completions/mean_terminated_length": 416.9464416503906,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 0.6510188289914882,
+      "grad_norm": 0.6781731247901917,
+      "kl": 0.057373046875,
+      "learning_rate": 1e-06,
+      "loss": 0.016,
+      "num_tokens": 72560537.0,
+      "reward": 1.2093751430511475,
+      "reward_std": 0.1222841814160347,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.20937499403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.2988364100456238,
+      "step": 631
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 858.0,
+      "completions/max_terminated_length": 858.0,
+      "completions/mean_length": 455.1250305175781,
+      "completions/mean_terminated_length": 455.1250305175781,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 0.6520505545524891,
+      "grad_norm": 0.7097309231758118,
+      "kl": 0.05877685546875,
+      "learning_rate": 1e-06,
+      "loss": -0.012,
+      "num_tokens": 72678774.0,
+      "reward": 1.1687501668930054,
+      "reward_std": 0.13737669587135315,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16875001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.2738715410232544,
+      "step": 632
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 802.0,
+      "completions/max_terminated_length": 802.0,
+      "completions/mean_length": 399.3125305175781,
+      "completions/mean_terminated_length": 399.3125305175781,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 0.6530822801134898,
+      "grad_norm": 0.6966571807861328,
+      "kl": 0.05914306640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0097,
+      "num_tokens": 72794778.0,
+      "reward": 1.2218750715255737,
+      "reward_std": 0.1375676989555359,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22187498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.300808310508728,
+      "step": 633
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 853.0,
+      "completions/max_terminated_length": 853.0,
+      "completions/mean_length": 408.9821472167969,
+      "completions/mean_terminated_length": 408.9821472167969,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "epoch": 0.6541140056744906,
+      "grad_norm": 0.677052915096283,
+      "kl": 0.06536865234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0176,
+      "num_tokens": 72907343.0,
+      "reward": 1.3156250715255737,
+      "reward_std": 0.14504189789295197,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3114938735961914,
+      "step": 634
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1008.0,
+      "completions/max_terminated_length": 1008.0,
+      "completions/mean_length": 427.0982360839844,
+      "completions/mean_terminated_length": 427.0982360839844,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 0.6551457312354914,
+      "grad_norm": 0.7582041621208191,
+      "kl": 0.0594482421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0127,
+      "num_tokens": 73027782.0,
+      "reward": 1.1968750953674316,
+      "reward_std": 0.14314253628253937,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.2730608284473419,
+      "step": 635
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 870.0,
+      "completions/max_terminated_length": 870.0,
+      "completions/mean_length": 393.33929443359375,
+      "completions/mean_terminated_length": 393.33929443359375,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 0.6561774567964921,
+      "grad_norm": 0.832430362701416,
+      "kl": 0.0611572265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0177,
+      "num_tokens": 73141605.0,
+      "reward": 1.2750000953674316,
+      "reward_std": 0.1692698895931244,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2749999761581421,
+      "rewards/curriculum_aware_reward_fn/std": 0.302392840385437,
+      "step": 636
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1235.0,
+      "completions/max_terminated_length": 1235.0,
+      "completions/mean_length": 443.5357360839844,
+      "completions/mean_terminated_length": 443.5357360839844,
+      "completions/min_length": 132.0,
+      "completions/min_terminated_length": 132.0,
+      "epoch": 0.6572091823574929,
+      "grad_norm": 0.7490401268005371,
+      "kl": 0.05474853515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0043,
+      "num_tokens": 73264844.0,
+      "reward": 1.1906250715255737,
+      "reward_std": 0.13961127400398254,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19062499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.27749940752983093,
+      "step": 637
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 774.0,
+      "completions/max_terminated_length": 774.0,
+      "completions/mean_length": 397.8125305175781,
+      "completions/mean_terminated_length": 397.8125305175781,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 0.6582409079184937,
+      "grad_norm": 0.6362108588218689,
+      "kl": 0.0712890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0114,
+      "num_tokens": 73380897.0,
+      "reward": 1.1968750953674316,
+      "reward_std": 0.1181226521730423,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.28102782368659973,
+      "step": 638
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 775.0,
+      "completions/max_terminated_length": 775.0,
+      "completions/mean_length": 378.5446472167969,
+      "completions/mean_terminated_length": 378.5446472167969,
+      "completions/min_length": 148.0,
+      "completions/min_terminated_length": 148.0,
+      "epoch": 0.6592726334794945,
+      "grad_norm": 0.8100889325141907,
+      "kl": 0.0693359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0075,
+      "num_tokens": 73497585.0,
+      "reward": 1.2156251668930054,
+      "reward_std": 0.15342311561107635,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21562500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.2867204546928406,
+      "step": 639
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1265.0,
+      "completions/max_terminated_length": 1265.0,
+      "completions/mean_length": 457.0714416503906,
+      "completions/mean_terminated_length": 457.0714416503906,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 0.6603043590404952,
+      "grad_norm": 0.6374899744987488,
+      "kl": 0.0521240234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0082,
+      "num_tokens": 73629291.0,
+      "reward": 1.2781251668930054,
+      "reward_std": 0.1284002810716629,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.27812498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.32596227526664734,
+      "step": 640
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 814.0,
+      "completions/max_terminated_length": 814.0,
+      "completions/mean_length": 385.7946472167969,
+      "completions/mean_terminated_length": 385.7946472167969,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 0.661336084601496,
+      "grad_norm": 0.7600776553153992,
+      "kl": 0.072021484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0133,
+      "num_tokens": 73740064.0,
+      "reward": 1.2218750715255737,
+      "reward_std": 0.12513509392738342,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22187498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.28959283232688904,
+      "step": 641
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 712.0,
+      "completions/max_terminated_length": 712.0,
+      "completions/mean_length": 418.40179443359375,
+      "completions/mean_terminated_length": 418.40179443359375,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.6623678101624968,
+      "grad_norm": 0.6850177645683289,
+      "kl": 0.0572509765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0014,
+      "num_tokens": 73855861.0,
+      "reward": 1.371875286102295,
+      "reward_std": 0.15071289241313934,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37187501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3364347517490387,
+      "step": 642
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 843.0,
+      "completions/max_terminated_length": 843.0,
+      "completions/mean_length": 375.8571472167969,
+      "completions/mean_terminated_length": 375.8571472167969,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 0.6633995357234975,
+      "grad_norm": 0.7585674524307251,
+      "kl": 0.058349609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0225,
+      "num_tokens": 73962987.0,
+      "reward": 1.3531252145767212,
+      "reward_std": 0.1872883141040802,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3271692097187042,
+      "step": 643
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1148.0,
+      "completions/max_terminated_length": 1148.0,
+      "completions/mean_length": 409.4375305175781,
+      "completions/mean_terminated_length": 409.4375305175781,
+      "completions/min_length": 149.0,
+      "completions/min_terminated_length": 149.0,
+      "epoch": 0.6644312612844984,
+      "grad_norm": 0.6967830061912537,
+      "kl": 0.05999755859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0069,
+      "num_tokens": 74079950.0,
+      "reward": 1.2312501668930054,
+      "reward_std": 0.13037815690040588,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.23124997317790985,
+      "rewards/curriculum_aware_reward_fn/std": 0.28790363669395447,
+      "step": 644
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 915.0,
+      "completions/max_terminated_length": 915.0,
+      "completions/mean_length": 408.7232360839844,
+      "completions/mean_terminated_length": 408.7232360839844,
+      "completions/min_length": 140.0,
+      "completions/min_terminated_length": 140.0,
+      "epoch": 0.6654629868454991,
+      "grad_norm": 0.6875054240226746,
+      "kl": 0.0599365234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0128,
+      "num_tokens": 74191471.0,
+      "reward": 1.309375286102295,
+      "reward_std": 0.11628562211990356,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30937498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.3177575469017029,
+      "step": 645
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 852.0,
+      "completions/max_terminated_length": 852.0,
+      "completions/mean_length": 397.5446472167969,
+      "completions/mean_terminated_length": 397.5446472167969,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 0.6664947124064998,
+      "grad_norm": 0.6609368920326233,
+      "kl": 0.064208984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0132,
+      "num_tokens": 74306584.0,
+      "reward": 1.2375000715255737,
+      "reward_std": 0.1822759509086609,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.23749999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.3193814158439636,
+      "step": 646
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 800.0,
+      "completions/max_terminated_length": 800.0,
+      "completions/mean_length": 392.6696472167969,
+      "completions/mean_terminated_length": 392.6696472167969,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 0.6675264379675007,
+      "grad_norm": 0.8001618385314941,
+      "kl": 0.07421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0138,
+      "num_tokens": 74416706.0,
+      "reward": 1.3500001430511475,
+      "reward_std": 0.15462636947631836,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3499999940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.30082467198371887,
+      "step": 647
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 805.0,
+      "completions/mean_length": 478.0625305175781,
+      "completions/mean_terminated_length": 445.4684753417969,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 0.6685581635285014,
+      "grad_norm": 0.7765113115310669,
+      "kl": 0.0609130859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0444,
+      "num_tokens": 74536665.0,
+      "reward": 1.2191965579986572,
+      "reward_std": 0.20141837000846863,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22812499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.27678829431533813,
+      "step": 648
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1479.0,
+      "completions/max_terminated_length": 1479.0,
+      "completions/mean_length": 422.0982360839844,
+      "completions/mean_terminated_length": 422.0982360839844,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 0.6695898890895022,
+      "grad_norm": 0.6240831017494202,
+      "kl": 0.06243896484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0382,
+      "num_tokens": 74654178.0,
+      "reward": 1.3718751668930054,
+      "reward_std": 0.08518895506858826,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37187498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.3126305937767029,
+      "step": 649
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1192.0,
+      "completions/max_terminated_length": 1192.0,
+      "completions/mean_length": 387.9732360839844,
+      "completions/mean_terminated_length": 387.9732360839844,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "epoch": 0.670621614650503,
+      "grad_norm": 0.8223195672035217,
+      "kl": 0.06793212890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0082,
+      "num_tokens": 74756662.0,
+      "reward": 1.4410717487335205,
+      "reward_std": 0.2007153332233429,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44999998807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.3166548013687134,
+      "step": 650
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 817.0,
+      "completions/max_terminated_length": 817.0,
+      "completions/mean_length": 432.2589416503906,
+      "completions/mean_terminated_length": 432.2589416503906,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 0.6716533402115037,
+      "grad_norm": 0.5505730509757996,
+      "kl": 0.06182861328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0005,
+      "num_tokens": 74881165.0,
+      "reward": 1.1875,
+      "reward_std": 0.0919622927904129,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1875,
+      "rewards/curriculum_aware_reward_fn/std": 0.2854978144168854,
+      "step": 651
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 821.0,
+      "completions/max_terminated_length": 821.0,
+      "completions/mean_length": 416.5714416503906,
+      "completions/mean_terminated_length": 416.5714416503906,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 0.6726850657725045,
+      "grad_norm": 0.5758817791938782,
+      "kl": 0.06842041015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0103,
+      "num_tokens": 74989302.0,
+      "reward": 1.2375000715255737,
+      "reward_std": 0.11903390288352966,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.23749999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.2979282736778259,
+      "step": 652
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 791.0,
+      "completions/max_terminated_length": 791.0,
+      "completions/mean_length": 394.2857360839844,
+      "completions/mean_terminated_length": 394.2857360839844,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "epoch": 0.6737167913335053,
+      "grad_norm": 0.635651171207428,
+      "kl": 0.0662841796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0116,
+      "num_tokens": 75100564.0,
+      "reward": 1.1660715341567993,
+      "reward_std": 0.15635573863983154,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.17499998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.2896098494529724,
+      "step": 653
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 620.0,
+      "completions/max_terminated_length": 620.0,
+      "completions/mean_length": 371.95538330078125,
+      "completions/mean_terminated_length": 371.95538330078125,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 0.6747485168945061,
+      "grad_norm": 0.6044211387634277,
+      "kl": 0.06475830078125,
+      "learning_rate": 1e-06,
+      "loss": -0.02,
+      "num_tokens": 75211943.0,
+      "reward": 1.3000000715255737,
+      "reward_std": 0.12482405453920364,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30000001192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.32159513235092163,
+      "step": 654
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 646.0,
+      "completions/max_terminated_length": 646.0,
+      "completions/mean_length": 343.58929443359375,
+      "completions/mean_terminated_length": 343.58929443359375,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 0.6757802424555068,
+      "grad_norm": 0.8712995052337646,
+      "kl": 0.067138671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0003,
+      "num_tokens": 75312683.0,
+      "reward": 1.325000286102295,
+      "reward_std": 0.16854573786258698,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32499998807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.3211045265197754,
+      "step": 655
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1504.0,
+      "completions/max_terminated_length": 1504.0,
+      "completions/mean_length": 459.7500305175781,
+      "completions/mean_terminated_length": 459.7500305175781,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 0.6768119680165076,
+      "grad_norm": 0.8327785134315491,
+      "kl": 0.0638427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0131,
+      "num_tokens": 75435694.0,
+      "reward": 1.2375000715255737,
+      "reward_std": 0.17822317779064178,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.23749998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.2979282736778259,
+      "step": 656
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 860.0,
+      "completions/max_terminated_length": 860.0,
+      "completions/mean_length": 390.7232360839844,
+      "completions/mean_terminated_length": 390.7232360839844,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 0.6778436935775084,
+      "grad_norm": 0.580292284488678,
+      "kl": 0.06488037109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0074,
+      "num_tokens": 75545927.0,
+      "reward": 1.343750238418579,
+      "reward_std": 0.08745487779378891,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34375,
+      "rewards/curriculum_aware_reward_fn/std": 0.2970671057701111,
+      "step": 657
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 943.0,
+      "completions/max_terminated_length": 943.0,
+      "completions/mean_length": 421.8571472167969,
+      "completions/mean_terminated_length": 421.8571472167969,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 0.6788754191385091,
+      "grad_norm": 0.5890258550643921,
+      "kl": 0.071533203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0057,
+      "num_tokens": 75664295.0,
+      "reward": 1.1437500715255737,
+      "reward_std": 0.11732659488916397,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14374999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.2639039158821106,
+      "step": 658
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 984.0,
+      "completions/max_terminated_length": 984.0,
+      "completions/mean_length": 415.89288330078125,
+      "completions/mean_terminated_length": 415.89288330078125,
+      "completions/min_length": 136.0,
+      "completions/min_terminated_length": 136.0,
+      "epoch": 0.67990714469951,
+      "grad_norm": 0.6905220746994019,
+      "kl": 0.06536865234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0026,
+      "num_tokens": 75778712.0,
+      "reward": 1.312500238418579,
+      "reward_std": 0.1588037759065628,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3021320700645447,
+      "step": 659
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1382.0,
+      "completions/max_terminated_length": 1382.0,
+      "completions/mean_length": 440.4910888671875,
+      "completions/mean_terminated_length": 440.4910888671875,
+      "completions/min_length": 135.0,
+      "completions/min_terminated_length": 135.0,
+      "epoch": 0.6809388702605107,
+      "grad_norm": 0.7469112277030945,
+      "kl": 0.06982421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0094,
+      "num_tokens": 75902331.0,
+      "reward": 1.403125286102295,
+      "reward_std": 0.13282155990600586,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40312501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3193660080432892,
+      "step": 660
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1259.0,
+      "completions/max_terminated_length": 1259.0,
+      "completions/mean_length": 421.3125305175781,
+      "completions/mean_terminated_length": 421.3125305175781,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 0.6819705958215114,
+      "grad_norm": 0.7019675970077515,
+      "kl": 0.0772705078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0041,
+      "num_tokens": 76018650.0,
+      "reward": 1.25,
+      "reward_std": 0.14943593740463257,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2499999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.32012102007865906,
+      "step": 661
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1330.0,
+      "completions/max_terminated_length": 1330.0,
+      "completions/mean_length": 436.3214416503906,
+      "completions/mean_terminated_length": 436.3214416503906,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 0.6830023213825123,
+      "grad_norm": 0.6849479079246521,
+      "kl": 0.0711669921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0196,
+      "num_tokens": 76146828.0,
+      "reward": 1.2281250953674316,
+      "reward_std": 0.12466341257095337,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22812499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.2885020077228546,
+      "step": 662
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 876.0,
+      "completions/max_terminated_length": 876.0,
+      "completions/mean_length": 432.1696472167969,
+      "completions/mean_terminated_length": 432.1696472167969,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 0.684034046943513,
+      "grad_norm": 0.6404550671577454,
+      "kl": 0.07568359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0015,
+      "num_tokens": 76259113.0,
+      "reward": 1.3937500715255737,
+      "reward_std": 0.12282286584377289,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39374998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.38203608989715576,
+      "step": 663
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1048.0,
+      "completions/max_terminated_length": 1048.0,
+      "completions/mean_length": 444.96429443359375,
+      "completions/mean_terminated_length": 444.96429443359375,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 0.6850657725045138,
+      "grad_norm": 0.602090060710907,
+      "kl": 0.0660400390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0027,
+      "num_tokens": 76381840.0,
+      "reward": 1.171875,
+      "reward_std": 0.10328420996665955,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.171875,
+      "rewards/curriculum_aware_reward_fn/std": 0.26574569940567017,
+      "step": 664
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 847.0,
+      "completions/max_terminated_length": 847.0,
+      "completions/mean_length": 415.9285888671875,
+      "completions/mean_terminated_length": 415.9285888671875,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "epoch": 0.6860974980655146,
+      "grad_norm": 0.6655356884002686,
+      "kl": 0.0723876953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0045,
+      "num_tokens": 76496856.0,
+      "reward": 1.3187501430511475,
+      "reward_std": 0.1583477109670639,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3187499940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.32055166363716125,
+      "step": 665
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 738.0,
+      "completions/max_terminated_length": 738.0,
+      "completions/mean_length": 388.5625305175781,
+      "completions/mean_terminated_length": 388.5625305175781,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "epoch": 0.6871292236265153,
+      "grad_norm": 0.825885534286499,
+      "kl": 0.075927734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0051,
+      "num_tokens": 76604285.0,
+      "reward": 1.418750286102295,
+      "reward_std": 0.1810518056154251,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41875001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.31106650829315186,
+      "step": 666
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 936.0,
+      "completions/max_terminated_length": 936.0,
+      "completions/mean_length": 420.08929443359375,
+      "completions/mean_terminated_length": 420.08929443359375,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 0.6881609491875161,
+      "grad_norm": 0.6745756268501282,
+      "kl": 0.08447265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0279,
+      "num_tokens": 76715175.0,
+      "reward": 1.2843750715255737,
+      "reward_std": 0.138127863407135,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28437498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.32390013337135315,
+      "step": 667
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1529.0,
+      "completions/max_terminated_length": 1529.0,
+      "completions/mean_length": 478.8750305175781,
+      "completions/mean_terminated_length": 478.8750305175781,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 0.6891926747485169,
+      "grad_norm": 0.7624357342720032,
+      "kl": 0.0677490234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0084,
+      "num_tokens": 76839005.0,
+      "reward": 1.2750000953674316,
+      "reward_std": 0.2256644070148468,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3466051518917084,
+      "step": 668
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 842.0,
+      "completions/max_terminated_length": 842.0,
+      "completions/mean_length": 441.8035888671875,
+      "completions/mean_terminated_length": 441.8035888671875,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 0.6902244003095177,
+      "grad_norm": 0.6996376514434814,
+      "kl": 0.07421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0031,
+      "num_tokens": 76950346.0,
+      "reward": 1.3218752145767212,
+      "reward_std": 0.13565517961978912,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3218750059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.31212589144706726,
+      "step": 669
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 956.0,
+      "completions/max_terminated_length": 956.0,
+      "completions/mean_length": 488.27679443359375,
+      "completions/mean_terminated_length": 488.27679443359375,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 0.6912561258705184,
+      "grad_norm": 0.5711653232574463,
+      "kl": 0.07177734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0009,
+      "num_tokens": 77079044.0,
+      "reward": 1.2129465341567993,
+      "reward_std": 0.14019827544689178,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22187498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3044550120830536,
+      "step": 670
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1641.0,
+      "completions/max_terminated_length": 1641.0,
+      "completions/mean_length": 510.6964416503906,
+      "completions/mean_terminated_length": 510.6964416503906,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 0.6922878514315192,
+      "grad_norm": 0.6841708421707153,
+      "kl": 0.06170654296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0167,
+      "num_tokens": 77211828.0,
+      "reward": 1.1750000715255737,
+      "reward_std": 0.1548759937286377,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.17499998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.25300002098083496,
+      "step": 671
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1008.0,
+      "completions/max_terminated_length": 1008.0,
+      "completions/mean_length": 465.857177734375,
+      "completions/mean_terminated_length": 465.857177734375,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 0.69331957699252,
+      "grad_norm": 0.6161367893218994,
+      "kl": 0.07373046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0078,
+      "num_tokens": 77343216.0,
+      "reward": 1.2437502145767212,
+      "reward_std": 0.1301034688949585,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24374999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.3180830180644989,
+      "step": 672
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 840.0,
+      "completions/max_terminated_length": 840.0,
+      "completions/mean_length": 434.6785888671875,
+      "completions/mean_terminated_length": 434.6785888671875,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "epoch": 0.6943513025535207,
+      "grad_norm": 0.7548561096191406,
+      "kl": 0.07275390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0053,
+      "num_tokens": 77455656.0,
+      "reward": 1.3468750715255737,
+      "reward_std": 0.1485518515110016,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.2914920449256897,
+      "step": 673
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1407.0,
+      "completions/max_terminated_length": 1407.0,
+      "completions/mean_length": 464.14288330078125,
+      "completions/mean_terminated_length": 464.14288330078125,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 0.6953830281145216,
+      "grad_norm": 0.784842312335968,
+      "kl": 0.071533203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0229,
+      "num_tokens": 77573895.0,
+      "reward": 1.3250001668930054,
+      "reward_std": 0.22508502006530762,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32500001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3245232105255127,
+      "step": 674
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 835.0,
+      "completions/max_terminated_length": 835.0,
+      "completions/mean_length": 461.7589416503906,
+      "completions/mean_terminated_length": 461.7589416503906,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 0.6964147536755223,
+      "grad_norm": 0.6696816086769104,
+      "kl": 0.0767822265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0223,
+      "num_tokens": 77698622.0,
+      "reward": 1.171875,
+      "reward_std": 0.10575476288795471,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.171875,
+      "rewards/curriculum_aware_reward_fn/std": 0.2615598738193512,
+      "step": 675
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1178.0,
+      "completions/max_terminated_length": 1178.0,
+      "completions/mean_length": 535.7053833007812,
+      "completions/mean_terminated_length": 535.7053833007812,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 0.697446479236523,
+      "grad_norm": 0.6169516444206238,
+      "kl": 0.065673828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0181,
+      "num_tokens": 77841053.0,
+      "reward": 1.1812500953674316,
+      "reward_std": 0.153910830616951,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.18124999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.28181561827659607,
+      "step": 676
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 887.0,
+      "completions/max_terminated_length": 887.0,
+      "completions/mean_length": 446.732177734375,
+      "completions/mean_terminated_length": 446.732177734375,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 0.6984782047975239,
+      "grad_norm": 0.6587204933166504,
+      "kl": 0.0693359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0321,
+      "num_tokens": 77962396.0,
+      "reward": 1.2687500715255737,
+      "reward_std": 0.14465731382369995,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.26875001192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.32543283700942993,
+      "step": 677
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1089.0,
+      "completions/max_terminated_length": 1089.0,
+      "completions/mean_length": 489.3125305175781,
+      "completions/mean_terminated_length": 489.3125305175781,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 0.6995099303585246,
+      "grad_norm": 0.48290061950683594,
+      "kl": 0.066650390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0171,
+      "num_tokens": 78094500.0,
+      "reward": 1.1218751668930054,
+      "reward_std": 0.07095565646886826,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.22388675808906555,
+      "step": 678
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 859.0,
+      "completions/max_terminated_length": 859.0,
+      "completions/mean_length": 459.2410888671875,
+      "completions/mean_terminated_length": 459.2410888671875,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 0.7005416559195254,
+      "grad_norm": 0.7850268483161926,
+      "kl": 0.070556640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0026,
+      "num_tokens": 78214591.0,
+      "reward": 1.2843750715255737,
+      "reward_std": 0.18927761912345886,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28437498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3027673363685608,
+      "step": 679
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1009.0,
+      "completions/max_terminated_length": 1009.0,
+      "completions/mean_length": 484.6160888671875,
+      "completions/mean_terminated_length": 484.6160888671875,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 0.7015733814805262,
+      "grad_norm": 0.6543923020362854,
+      "kl": 0.0738525390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0326,
+      "num_tokens": 78343796.0,
+      "reward": 1.2254464626312256,
+      "reward_std": 0.18909485638141632,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.234375,
+      "rewards/curriculum_aware_reward_fn/std": 0.2948530614376068,
+      "step": 680
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1448.0,
+      "completions/max_terminated_length": 1448.0,
+      "completions/mean_length": 520.3928833007812,
+      "completions/mean_terminated_length": 520.3928833007812,
+      "completions/min_length": 254.0,
+      "completions/min_terminated_length": 254.0,
+      "epoch": 0.7026051070415269,
+      "grad_norm": 0.7107188701629639,
+      "kl": 0.0721435546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0113,
+      "num_tokens": 78476322.0,
+      "reward": 1.2410715818405151,
+      "reward_std": 0.15162546932697296,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25,
+      "rewards/curriculum_aware_reward_fn/std": 0.28742408752441406,
+      "step": 681
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1740.0,
+      "completions/max_terminated_length": 1740.0,
+      "completions/mean_length": 584.9553833007812,
+      "completions/mean_terminated_length": 584.9553833007812,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 0.7036368326025277,
+      "grad_norm": 0.7118996381759644,
+      "kl": 0.063232421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0138,
+      "num_tokens": 78611522.0,
+      "reward": 1.2468751668930054,
+      "reward_std": 0.17215028405189514,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24687500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.30315765738487244,
+      "step": 682
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 926.0,
+      "completions/max_terminated_length": 926.0,
+      "completions/mean_length": 475.76788330078125,
+      "completions/mean_terminated_length": 475.76788330078125,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "epoch": 0.7046685581635285,
+      "grad_norm": 0.7987418174743652,
+      "kl": 0.078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0297,
+      "num_tokens": 78730902.0,
+      "reward": 1.3218750953674316,
+      "reward_std": 0.2143011838197708,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3218750059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.29764699935913086,
+      "step": 683
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1221.0,
+      "completions/max_terminated_length": 1221.0,
+      "completions/mean_length": 487.83038330078125,
+      "completions/mean_terminated_length": 487.83038330078125,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "epoch": 0.7057002837245293,
+      "grad_norm": 0.7103044986724854,
+      "kl": 0.070556640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0213,
+      "num_tokens": 78849337.0,
+      "reward": 1.2875001430511475,
+      "reward_std": 0.16721250116825104,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2874999940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.2979282736778259,
+      "step": 684
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1761.0,
+      "completions/max_terminated_length": 1761.0,
+      "completions/mean_length": 539.0,
+      "completions/mean_terminated_length": 539.0,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 0.70673200928553,
+      "grad_norm": 0.4926302134990692,
+      "kl": 0.0657958984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0254,
+      "num_tokens": 78977080.0,
+      "reward": 1.296875,
+      "reward_std": 0.11531693488359451,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.296875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3052307665348053,
+      "step": 685
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1386.0,
+      "completions/max_terminated_length": 1386.0,
+      "completions/mean_length": 535.2232666015625,
+      "completions/mean_terminated_length": 535.2232666015625,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 0.7077637348465308,
+      "grad_norm": 0.9094158411026001,
+      "kl": 0.0721435546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0055,
+      "num_tokens": 79110476.0,
+      "reward": 1.28125,
+      "reward_std": 0.13001351058483124,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.32832667231559753,
+      "step": 686
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 914.0,
+      "completions/max_terminated_length": 914.0,
+      "completions/mean_length": 448.3035888671875,
+      "completions/mean_terminated_length": 448.3035888671875,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 0.7087954604075316,
+      "grad_norm": 0.7349770069122314,
+      "kl": 0.07373046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0114,
+      "num_tokens": 79221187.0,
+      "reward": 1.3508931398391724,
+      "reward_std": 0.2023835927248001,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3687500059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.30023449659347534,
+      "step": 687
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 925.0,
+      "completions/max_terminated_length": 925.0,
+      "completions/mean_length": 497.8125305175781,
+      "completions/mean_terminated_length": 497.8125305175781,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 0.7098271859685323,
+      "grad_norm": 0.5558754205703735,
+      "kl": 0.0653076171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0001,
+      "num_tokens": 79336057.0,
+      "reward": 1.21875,
+      "reward_std": 0.11050201207399368,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21875,
+      "rewards/curriculum_aware_reward_fn/std": 0.32251298427581787,
+      "step": 688
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1387.0,
+      "completions/max_terminated_length": 1387.0,
+      "completions/mean_length": 486.3750305175781,
+      "completions/mean_terminated_length": 486.3750305175781,
+      "completions/min_length": 303.0,
+      "completions/min_terminated_length": 303.0,
+      "epoch": 0.7108589115295332,
+      "grad_norm": 0.6595085263252258,
+      "kl": 0.079833984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0301,
+      "num_tokens": 79465298.0,
+      "reward": 1.4093750715255737,
+      "reward_std": 0.14184318482875824,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40937498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3004149794578552,
+      "step": 689
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 909.0,
+      "completions/max_terminated_length": 909.0,
+      "completions/mean_length": 480.01788330078125,
+      "completions/mean_terminated_length": 480.01788330078125,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 0.7118906370905339,
+      "grad_norm": 0.7275698781013489,
+      "kl": 0.0732421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0106,
+      "num_tokens": 79579050.0,
+      "reward": 1.296875238418579,
+      "reward_std": 0.21182642877101898,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.296875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3123783469200134,
+      "step": 690
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1623.0,
+      "completions/max_terminated_length": 1623.0,
+      "completions/mean_length": 486.669677734375,
+      "completions/mean_terminated_length": 486.669677734375,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "epoch": 0.7129223626515347,
+      "grad_norm": 0.7305976748466492,
+      "kl": 0.076416015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0256,
+      "num_tokens": 79693022.0,
+      "reward": 1.3004463911056519,
+      "reward_std": 0.1608843356370926,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.32121187448501587,
+      "step": 691
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1125.0,
+      "completions/max_terminated_length": 1125.0,
+      "completions/mean_length": 468.2589416503906,
+      "completions/mean_terminated_length": 468.2589416503906,
+      "completions/min_length": 138.0,
+      "completions/min_terminated_length": 138.0,
+      "epoch": 0.7139540882125355,
+      "grad_norm": 0.7382761836051941,
+      "kl": 0.0733642578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0329,
+      "num_tokens": 79808166.0,
+      "reward": 1.375000238418579,
+      "reward_std": 0.15719659626483917,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.375,
+      "rewards/curriculum_aware_reward_fn/std": 0.3106227219104767,
+      "step": 692
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 696.0,
+      "completions/max_terminated_length": 696.0,
+      "completions/mean_length": 423.52679443359375,
+      "completions/mean_terminated_length": 423.52679443359375,
+      "completions/min_length": 231.0,
+      "completions/min_terminated_length": 231.0,
+      "epoch": 0.7149858137735362,
+      "grad_norm": 0.7762972712516785,
+      "kl": 0.0870361328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0149,
+      "num_tokens": 79922514.0,
+      "reward": 1.3593751192092896,
+      "reward_std": 0.18821999430656433,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.359375,
+      "rewards/curriculum_aware_reward_fn/std": 0.33702000975608826,
+      "step": 693
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 914.0,
+      "completions/max_terminated_length": 914.0,
+      "completions/mean_length": 455.6607360839844,
+      "completions/mean_terminated_length": 455.6607360839844,
+      "completions/min_length": 136.0,
+      "completions/min_terminated_length": 136.0,
+      "epoch": 0.716017539334537,
+      "grad_norm": 0.6777034997940063,
+      "kl": 0.0794677734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0017,
+      "num_tokens": 80039530.0,
+      "reward": 1.359375238418579,
+      "reward_std": 0.1436331868171692,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.359375,
+      "rewards/curriculum_aware_reward_fn/std": 0.33702000975608826,
+      "step": 694
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 816.0,
+      "completions/max_terminated_length": 816.0,
+      "completions/mean_length": 424.7232360839844,
+      "completions/mean_terminated_length": 424.7232360839844,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 0.7170492648955378,
+      "grad_norm": 0.6738489866256714,
+      "kl": 0.07177734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0337,
+      "num_tokens": 80153732.0,
+      "reward": 1.4812501668930054,
+      "reward_std": 0.17275573313236237,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48124998807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.37621423602104187,
+      "step": 695
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1146.0,
+      "completions/max_terminated_length": 1146.0,
+      "completions/mean_length": 502.5357360839844,
+      "completions/mean_terminated_length": 502.5357360839844,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 0.7180809904565386,
+      "grad_norm": 0.7249394059181213,
+      "kl": 0.05865478515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0035,
+      "num_tokens": 80277055.0,
+      "reward": 1.3781250715255737,
+      "reward_std": 0.15655311942100525,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.31911906599998474,
+      "step": 696
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1049.0,
+      "completions/max_terminated_length": 1049.0,
+      "completions/mean_length": 483.26788330078125,
+      "completions/mean_terminated_length": 483.26788330078125,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 0.7191127160175393,
+      "grad_norm": 0.6074086427688599,
+      "kl": 0.06689453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0069,
+      "num_tokens": 80405122.0,
+      "reward": 1.296875,
+      "reward_std": 0.08536109328269958,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.296875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3052307665348053,
+      "step": 697
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 933.0,
+      "completions/max_terminated_length": 933.0,
+      "completions/mean_length": 482.232177734375,
+      "completions/mean_terminated_length": 482.232177734375,
+      "completions/min_length": 282.0,
+      "completions/min_terminated_length": 282.0,
+      "epoch": 0.7201444415785401,
+      "grad_norm": 0.571091890335083,
+      "kl": 0.078125,
+      "learning_rate": 1e-06,
+      "loss": 0.005,
+      "num_tokens": 80536384.0,
+      "reward": 1.2062500715255737,
+      "reward_std": 0.11699223518371582,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.20624999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.291711688041687,
+      "step": 698
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 929.0,
+      "completions/max_terminated_length": 929.0,
+      "completions/mean_length": 490.2410888671875,
+      "completions/mean_terminated_length": 490.2410888671875,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 0.7211761671395409,
+      "grad_norm": 0.7024215459823608,
+      "kl": 0.0699462890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0105,
+      "num_tokens": 80666373.0,
+      "reward": 1.25,
+      "reward_std": 0.167774498462677,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25,
+      "rewards/curriculum_aware_reward_fn/std": 0.3060206472873688,
+      "step": 699
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 838.0,
+      "completions/max_terminated_length": 838.0,
+      "completions/mean_length": 434.8839416503906,
+      "completions/mean_terminated_length": 434.8839416503906,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 0.7222078927005416,
+      "grad_norm": 0.7631708383560181,
+      "kl": 0.0819091796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0054,
+      "num_tokens": 80787369.0,
+      "reward": 1.3125,
+      "reward_std": 0.16144998371601105,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3125,
+      "rewards/curriculum_aware_reward_fn/std": 0.31987470388412476,
+      "step": 700
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 831.0,
+      "completions/max_terminated_length": 831.0,
+      "completions/mean_length": 440.8750305175781,
+      "completions/mean_terminated_length": 440.8750305175781,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 0.7232396182615425,
+      "grad_norm": 0.7090742588043213,
+      "kl": 0.080322265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0088,
+      "num_tokens": 80901896.0,
+      "reward": 1.3468750715255737,
+      "reward_std": 0.1540508270263672,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3305251896381378,
+      "step": 701
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 962.0,
+      "completions/max_terminated_length": 962.0,
+      "completions/mean_length": 463.01788330078125,
+      "completions/mean_terminated_length": 463.01788330078125,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 0.7242713438225432,
+      "grad_norm": 0.7182156443595886,
+      "kl": 0.0772705078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0186,
+      "num_tokens": 81022494.0,
+      "reward": 1.3312500715255737,
+      "reward_std": 0.1629144847393036,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33124998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3249480128288269,
+      "step": 702
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 979.0,
+      "completions/max_terminated_length": 979.0,
+      "completions/mean_length": 460.4285888671875,
+      "completions/mean_terminated_length": 460.4285888671875,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 0.7253030693835439,
+      "grad_norm": 0.6034790277481079,
+      "kl": 0.0723876953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0095,
+      "num_tokens": 81138560.0,
+      "reward": 1.2531250715255737,
+      "reward_std": 0.10742086172103882,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25312498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3228031396865845,
+      "step": 703
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 837.0,
+      "completions/max_terminated_length": 837.0,
+      "completions/mean_length": 455.27679443359375,
+      "completions/mean_terminated_length": 455.27679443359375,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 0.7263347949445448,
+      "grad_norm": 0.6957827806472778,
+      "kl": 0.076171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0038,
+      "num_tokens": 81260041.0,
+      "reward": 1.4406250715255737,
+      "reward_std": 0.1609530746936798,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44062498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.29243704676628113,
+      "step": 704
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 991.0,
+      "completions/max_terminated_length": 991.0,
+      "completions/mean_length": 494.1785888671875,
+      "completions/mean_terminated_length": 494.1785888671875,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 0.7273665205055455,
+      "grad_norm": 0.6472601294517517,
+      "kl": 0.0770263671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0421,
+      "num_tokens": 81386292.0,
+      "reward": 1.2625001668930054,
+      "reward_std": 0.1701442450284958,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.26250001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.302653431892395,
+      "step": 705
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 739.0,
+      "completions/max_terminated_length": 739.0,
+      "completions/mean_length": 435.76788330078125,
+      "completions/mean_terminated_length": 435.76788330078125,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 0.7283982460665464,
+      "grad_norm": 0.6905910968780518,
+      "kl": 0.09326171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0209,
+      "num_tokens": 81497958.0,
+      "reward": 1.3312500715255737,
+      "reward_std": 0.11016502231359482,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33124998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.32832667231559753,
+      "step": 706
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 679.0,
+      "completions/max_terminated_length": 679.0,
+      "completions/mean_length": 417.83929443359375,
+      "completions/mean_terminated_length": 417.83929443359375,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 0.7294299716275471,
+      "grad_norm": 0.7361639738082886,
+      "kl": 0.082275390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0271,
+      "num_tokens": 81609688.0,
+      "reward": 1.3875001668930054,
+      "reward_std": 0.17972160875797272,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38749998807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.3968626856803894,
+      "step": 707
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1104.0,
+      "completions/max_terminated_length": 1104.0,
+      "completions/mean_length": 489.4375305175781,
+      "completions/mean_terminated_length": 489.4375305175781,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 0.7304616971885478,
+      "grad_norm": 0.5894014239311218,
+      "kl": 0.077880859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0012,
+      "num_tokens": 81734190.0,
+      "reward": 1.25,
+      "reward_std": 0.13956649601459503,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2499999850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.3369176685810089,
+      "step": 708
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 682.0,
+      "completions/max_terminated_length": 682.0,
+      "completions/mean_length": 424.9821472167969,
+      "completions/mean_terminated_length": 424.9821472167969,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 0.7314934227495486,
+      "grad_norm": 0.7483744621276855,
+      "kl": 0.0867919921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0055,
+      "num_tokens": 81851144.0,
+      "reward": 1.3937500715255737,
+      "reward_std": 0.15640275180339813,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39375001192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.29759734869003296,
+      "step": 709
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 734.0,
+      "completions/max_terminated_length": 734.0,
+      "completions/mean_length": 415.7321472167969,
+      "completions/mean_terminated_length": 415.7321472167969,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 0.7325251483105494,
+      "grad_norm": 0.620214581489563,
+      "kl": 0.08349609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0048,
+      "num_tokens": 81966327.0,
+      "reward": 1.271875023841858,
+      "reward_std": 0.1221185103058815,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2718749940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.29975825548171997,
+      "step": 710
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1323.0,
+      "completions/max_terminated_length": 1323.0,
+      "completions/mean_length": 478.6964416503906,
+      "completions/mean_terminated_length": 478.6964416503906,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 0.7335568738715502,
+      "grad_norm": 0.6635290384292603,
+      "kl": 0.0855712890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0056,
+      "num_tokens": 82089682.0,
+      "reward": 1.2843750715255737,
+      "reward_std": 0.14508238434791565,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28437498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3063907325267792,
+      "step": 711
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 617.0,
+      "completions/max_terminated_length": 617.0,
+      "completions/mean_length": 422.2589416503906,
+      "completions/mean_terminated_length": 422.2589416503906,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 0.734588599432551,
+      "grad_norm": 0.7747433185577393,
+      "kl": 0.0855712890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0089,
+      "num_tokens": 82209649.0,
+      "reward": 1.3093751668930054,
+      "reward_std": 0.14272122085094452,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3071615993976593,
+      "step": 712
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 965.0,
+      "completions/max_terminated_length": 965.0,
+      "completions/mean_length": 475.58038330078125,
+      "completions/mean_terminated_length": 475.58038330078125,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "epoch": 0.7356203249935517,
+      "grad_norm": 0.6959653496742249,
+      "kl": 0.0831298828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0022,
+      "num_tokens": 82323646.0,
+      "reward": 1.1906250715255737,
+      "reward_std": 0.11970683187246323,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19062498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.23904192447662354,
+      "step": 713
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 799.0,
+      "completions/max_terminated_length": 799.0,
+      "completions/mean_length": 404.7946472167969,
+      "completions/mean_terminated_length": 404.7946472167969,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 0.7366520505545525,
+      "grad_norm": 0.7410394549369812,
+      "kl": 0.0875244140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0261,
+      "num_tokens": 82431325.0,
+      "reward": 1.3906251192092896,
+      "reward_std": 0.1362684816122055,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.390625,
+      "rewards/curriculum_aware_reward_fn/std": 0.353905588388443,
+      "step": 714
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1029.0,
+      "completions/max_terminated_length": 1029.0,
+      "completions/mean_length": 425.1696472167969,
+      "completions/mean_terminated_length": 425.1696472167969,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 0.7376837761155532,
+      "grad_norm": 0.7336283922195435,
+      "kl": 0.091064453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0166,
+      "num_tokens": 82544219.0,
+      "reward": 1.343750238418579,
+      "reward_std": 0.1931782364845276,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34375,
+      "rewards/curriculum_aware_reward_fn/std": 0.3354521691799164,
+      "step": 715
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 851.0,
+      "completions/max_terminated_length": 851.0,
+      "completions/mean_length": 437.21429443359375,
+      "completions/mean_terminated_length": 437.21429443359375,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 0.7387155016765541,
+      "grad_norm": 0.6753544211387634,
+      "kl": 0.0867919921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0181,
+      "num_tokens": 82653919.0,
+      "reward": 1.3343751430511475,
+      "reward_std": 0.13460318744182587,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3343749940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.2910861372947693,
+      "step": 716
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1049.0,
+      "completions/max_terminated_length": 1049.0,
+      "completions/mean_length": 502.1785888671875,
+      "completions/mean_terminated_length": 502.1785888671875,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 0.7397472272375548,
+      "grad_norm": 0.751768171787262,
+      "kl": 0.0675048828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0019,
+      "num_tokens": 82781227.0,
+      "reward": 1.250000238418579,
+      "reward_std": 0.1571681946516037,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25,
+      "rewards/curriculum_aware_reward_fn/std": 0.2835584580898285,
+      "step": 717
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1129.0,
+      "completions/max_terminated_length": 1129.0,
+      "completions/mean_length": 498.8660888671875,
+      "completions/mean_terminated_length": 498.8660888671875,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 0.7407789527985555,
+      "grad_norm": 0.8301697969436646,
+      "kl": 0.08447265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0168,
+      "num_tokens": 82899358.0,
+      "reward": 1.3062500953674316,
+      "reward_std": 0.18210452795028687,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.30852195620536804,
+      "step": 718
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 839.0,
+      "completions/max_terminated_length": 839.0,
+      "completions/mean_length": 455.982177734375,
+      "completions/mean_terminated_length": 455.982177734375,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 0.7418106783595564,
+      "grad_norm": 0.5934935808181763,
+      "kl": 0.087890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0086,
+      "num_tokens": 83013932.0,
+      "reward": 1.2937501668930054,
+      "reward_std": 0.13134068250656128,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.2991824150085449,
+      "step": 719
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 777.0,
+      "completions/max_terminated_length": 777.0,
+      "completions/mean_length": 481.5535888671875,
+      "completions/mean_terminated_length": 481.5535888671875,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "epoch": 0.7428424039205571,
+      "grad_norm": 0.720224142074585,
+      "kl": 0.0858154296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0213,
+      "num_tokens": 83130726.0,
+      "reward": 1.171875238418579,
+      "reward_std": 0.1682029515504837,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.171875,
+      "rewards/curriculum_aware_reward_fn/std": 0.2529805302619934,
+      "step": 720
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 651.0,
+      "completions/max_terminated_length": 651.0,
+      "completions/mean_length": 397.5357360839844,
+      "completions/mean_terminated_length": 397.5357360839844,
+      "completions/min_length": 257.0,
+      "completions/min_terminated_length": 257.0,
+      "epoch": 0.743874129481558,
+      "grad_norm": 0.6368864178657532,
+      "kl": 0.0858154296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0035,
+      "num_tokens": 83241722.0,
+      "reward": 1.3781250715255737,
+      "reward_std": 0.13773974776268005,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.32933053374290466,
+      "step": 721
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 759.0,
+      "completions/max_terminated_length": 759.0,
+      "completions/mean_length": 437.5535888671875,
+      "completions/mean_terminated_length": 437.5535888671875,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 0.7449058550425587,
+      "grad_norm": 0.8228105306625366,
+      "kl": 0.0982666015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0013,
+      "num_tokens": 83361302.0,
+      "reward": 1.281250238418579,
+      "reward_std": 0.1560710370540619,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.3145943284034729,
+      "step": 722
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 896.0,
+      "completions/max_terminated_length": 896.0,
+      "completions/mean_length": 472.0714416503906,
+      "completions/mean_terminated_length": 472.0714416503906,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 0.7459375806035594,
+      "grad_norm": 0.7197965383529663,
+      "kl": 0.08154296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0034,
+      "num_tokens": 83482396.0,
+      "reward": 1.2906252145767212,
+      "reward_std": 0.12630914151668549,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.27349352836608887,
+      "step": 723
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 826.0,
+      "completions/max_terminated_length": 826.0,
+      "completions/mean_length": 475.40179443359375,
+      "completions/mean_terminated_length": 475.40179443359375,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 0.7469693061645603,
+      "grad_norm": 0.6598973870277405,
+      "kl": 0.079833984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0206,
+      "num_tokens": 83611927.0,
+      "reward": 1.2906252145767212,
+      "reward_std": 0.1274426281452179,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.29671862721443176,
+      "step": 724
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 792.0,
+      "completions/max_terminated_length": 792.0,
+      "completions/mean_length": 394.26788330078125,
+      "completions/mean_terminated_length": 394.26788330078125,
+      "completions/min_length": 166.0,
+      "completions/min_terminated_length": 166.0,
+      "epoch": 0.748001031725561,
+      "grad_norm": 0.6687096357345581,
+      "kl": 0.08544921875,
+      "learning_rate": 1e-06,
+      "loss": 0.007,
+      "num_tokens": 83714619.0,
+      "reward": 1.328125,
+      "reward_std": 0.13225266337394714,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.328125,
+      "rewards/curriculum_aware_reward_fn/std": 0.32980892062187195,
+      "step": 725
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 757.0,
+      "completions/max_terminated_length": 757.0,
+      "completions/mean_length": 422.4910888671875,
+      "completions/mean_terminated_length": 422.4910888671875,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "epoch": 0.7490327572865618,
+      "grad_norm": 0.6657594442367554,
+      "kl": 0.0855712890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0197,
+      "num_tokens": 83824105.0,
+      "reward": 1.2531250715255737,
+      "reward_std": 0.08941584080457687,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25312498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.2707414925098419,
+      "step": 726
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 802.0,
+      "completions/max_terminated_length": 802.0,
+      "completions/mean_length": 463.9464416503906,
+      "completions/mean_terminated_length": 463.9464416503906,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "epoch": 0.7500644828475626,
+      "grad_norm": 0.6711493134498596,
+      "kl": 0.0826416015625,
+      "learning_rate": 1e-06,
+      "loss": -0.004,
+      "num_tokens": 83943428.0,
+      "reward": 1.2250001430511475,
+      "reward_std": 0.16174761950969696,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22500000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.34156501293182373,
+      "step": 727
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 913.0,
+      "completions/max_terminated_length": 913.0,
+      "completions/mean_length": 496.3750305175781,
+      "completions/mean_terminated_length": 496.3750305175781,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 0.7510962084085633,
+      "grad_norm": 0.7064552903175354,
+      "kl": 0.07373046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0308,
+      "num_tokens": 84066124.0,
+      "reward": 1.171875,
+      "reward_std": 0.1559474766254425,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.171875,
+      "rewards/curriculum_aware_reward_fn/std": 0.2529805302619934,
+      "step": 728
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 698.0,
+      "completions/max_terminated_length": 698.0,
+      "completions/mean_length": 389.96429443359375,
+      "completions/mean_terminated_length": 389.96429443359375,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 0.7521279339695641,
+      "grad_norm": 0.5838387608528137,
+      "kl": 0.076904296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0087,
+      "num_tokens": 84173150.0,
+      "reward": 1.3218752145767212,
+      "reward_std": 0.11716850101947784,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3218749463558197,
+      "rewards/curriculum_aware_reward_fn/std": 0.33266472816467285,
+      "step": 729
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 909.0,
+      "completions/max_terminated_length": 909.0,
+      "completions/mean_length": 438.89288330078125,
+      "completions/mean_terminated_length": 438.89288330078125,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 0.7531596595305649,
+      "grad_norm": 0.732061505317688,
+      "kl": 0.072998046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0078,
+      "num_tokens": 84285803.0,
+      "reward": 1.278571605682373,
+      "reward_std": 0.18561238050460815,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2874999940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.30524691939353943,
+      "step": 730
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 707.0,
+      "completions/max_terminated_length": 707.0,
+      "completions/mean_length": 416.6071472167969,
+      "completions/mean_terminated_length": 416.6071472167969,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 0.7541913850915657,
+      "grad_norm": 0.781933605670929,
+      "kl": 0.075439453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0076,
+      "num_tokens": 84391494.0,
+      "reward": 1.1968752145767212,
+      "reward_std": 0.1363893300294876,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.2730608284473419,
+      "step": 731
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1764.0,
+      "completions/max_terminated_length": 1764.0,
+      "completions/mean_length": 439.9375305175781,
+      "completions/mean_terminated_length": 439.9375305175781,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 0.7552231106525664,
+      "grad_norm": 0.5383293628692627,
+      "kl": 0.077880859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0038,
+      "num_tokens": 84514316.0,
+      "reward": 1.1687500476837158,
+      "reward_std": 0.06947815418243408,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16875000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.2778719961643219,
+      "step": 732
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1002.0,
+      "completions/max_terminated_length": 1002.0,
+      "completions/mean_length": 470.7410888671875,
+      "completions/mean_terminated_length": 470.7410888671875,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 0.7562548362135671,
+      "grad_norm": 0.7183967232704163,
+      "kl": 0.07177734375,
+      "learning_rate": 1e-06,
+      "loss": -0.002,
+      "num_tokens": 84644116.0,
+      "reward": 1.2531250715255737,
+      "reward_std": 0.15776854753494263,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25312501192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.29040831327438354,
+      "step": 733
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1185.0,
+      "completions/max_terminated_length": 1185.0,
+      "completions/mean_length": 443.5357360839844,
+      "completions/mean_terminated_length": 443.5357360839844,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.757286561774568,
+      "grad_norm": 0.6907499432563782,
+      "kl": 0.080810546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0224,
+      "num_tokens": 84766326.0,
+      "reward": 1.1437500715255737,
+      "reward_std": 0.12301648408174515,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.24209344387054443,
+      "step": 734
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 883.0,
+      "completions/max_terminated_length": 883.0,
+      "completions/mean_length": 447.9107360839844,
+      "completions/mean_terminated_length": 447.9107360839844,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 0.7583182873355687,
+      "grad_norm": 0.7715913653373718,
+      "kl": 0.076171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0091,
+      "num_tokens": 84888634.0,
+      "reward": 1.1625001430511475,
+      "reward_std": 0.15382583439350128,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16250000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.26546746492385864,
+      "step": 735
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 788.0,
+      "completions/max_terminated_length": 788.0,
+      "completions/mean_length": 434.5446472167969,
+      "completions/mean_terminated_length": 434.5446472167969,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 0.7593500128965696,
+      "grad_norm": 0.7604141235351562,
+      "kl": 0.0784912109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0004,
+      "num_tokens": 85000233.0,
+      "reward": 1.2718751430511475,
+      "reward_std": 0.1417272835969925,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2718749940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.31060686707496643,
+      "step": 736
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 858.0,
+      "completions/max_terminated_length": 858.0,
+      "completions/mean_length": 448.65179443359375,
+      "completions/mean_terminated_length": 448.65179443359375,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 0.7603817384575703,
+      "grad_norm": 0.7140162587165833,
+      "kl": 0.083251953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0094,
+      "num_tokens": 85118886.0,
+      "reward": 1.3250001668930054,
+      "reward_std": 0.13391801714897156,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32500001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.33785226941108704,
+      "step": 737
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 779.0,
+      "completions/max_terminated_length": 779.0,
+      "completions/mean_length": 382.1785888671875,
+      "completions/mean_terminated_length": 382.1785888671875,
+      "completions/min_length": 153.0,
+      "completions/min_terminated_length": 153.0,
+      "epoch": 0.761413464018571,
+      "grad_norm": 0.7177646160125732,
+      "kl": 0.0946044921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0002,
+      "num_tokens": 85226286.0,
+      "reward": 1.4000000953674316,
+      "reward_std": 0.12313154339790344,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40000003576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.30756235122680664,
+      "step": 738
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1061.0,
+      "completions/max_terminated_length": 1061.0,
+      "completions/mean_length": 481.51788330078125,
+      "completions/mean_terminated_length": 481.51788330078125,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 0.7624451895795719,
+      "grad_norm": 0.6312434673309326,
+      "kl": 0.0701904296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0171,
+      "num_tokens": 85348158.0,
+      "reward": 1.296875,
+      "reward_std": 0.12531256675720215,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.296875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3228031396865845,
+      "step": 739
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 822.0,
+      "completions/max_terminated_length": 822.0,
+      "completions/mean_length": 451.58038330078125,
+      "completions/mean_terminated_length": 451.58038330078125,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "epoch": 0.7634769151405726,
+      "grad_norm": 0.7754645943641663,
+      "kl": 0.083984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0074,
+      "num_tokens": 85467622.0,
+      "reward": 1.2125002145767212,
+      "reward_std": 0.19816157221794128,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21249999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.279357373714447,
+      "step": 740
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1117.0,
+      "completions/max_terminated_length": 1117.0,
+      "completions/mean_length": 437.4375305175781,
+      "completions/mean_terminated_length": 437.4375305175781,
+      "completions/min_length": 161.0,
+      "completions/min_terminated_length": 161.0,
+      "epoch": 0.7645086407015734,
+      "grad_norm": 0.6591800451278687,
+      "kl": 0.081298828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0114,
+      "num_tokens": 85578607.0,
+      "reward": 1.281250238418579,
+      "reward_std": 0.08816681802272797,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3145943284034729,
+      "step": 741
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 832.0,
+      "completions/max_terminated_length": 832.0,
+      "completions/mean_length": 452.51788330078125,
+      "completions/mean_terminated_length": 452.51788330078125,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 0.7655403662625742,
+      "grad_norm": 0.76875239610672,
+      "kl": 0.08154296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0062,
+      "num_tokens": 85692779.0,
+      "reward": 1.2562501430511475,
+      "reward_std": 0.15450942516326904,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2562499940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.2933286130428314,
+      "step": 742
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 796.0,
+      "completions/max_terminated_length": 796.0,
+      "completions/mean_length": 399.5982360839844,
+      "completions/mean_terminated_length": 399.5982360839844,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 0.7665720918235749,
+      "grad_norm": 0.7755102515220642,
+      "kl": 0.0784912109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0093,
+      "num_tokens": 85809613.0,
+      "reward": 1.421875238418579,
+      "reward_std": 0.1664501279592514,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.421875,
+      "rewards/curriculum_aware_reward_fn/std": 0.30856987833976746,
+      "step": 743
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1017.0,
+      "completions/max_terminated_length": 1017.0,
+      "completions/mean_length": 491.4107360839844,
+      "completions/mean_terminated_length": 491.4107360839844,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 0.7676038173845757,
+      "grad_norm": 0.7311500310897827,
+      "kl": 0.082763671875,
+      "learning_rate": 1e-06,
+      "loss": 0.021,
+      "num_tokens": 85927112.0,
+      "reward": 1.1906250715255737,
+      "reward_std": 0.16675879061222076,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19062498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.26530036330223083,
+      "step": 744
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 798.0,
+      "completions/max_terminated_length": 798.0,
+      "completions/mean_length": 453.6785888671875,
+      "completions/mean_terminated_length": 453.6785888671875,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 0.7686355429455765,
+      "grad_norm": 0.7551841139793396,
+      "kl": 0.07421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0072,
+      "num_tokens": 86047273.0,
+      "reward": 1.296875238418579,
+      "reward_std": 0.189897820353508,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.296875,
+      "rewards/curriculum_aware_reward_fn/std": 0.28658291697502136,
+      "step": 745
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 777.0,
+      "completions/max_terminated_length": 777.0,
+      "completions/mean_length": 471.8750305175781,
+      "completions/mean_terminated_length": 471.8750305175781,
+      "completions/min_length": 254.0,
+      "completions/min_terminated_length": 254.0,
+      "epoch": 0.7696672685065773,
+      "grad_norm": 0.7156517505645752,
+      "kl": 0.07177734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0135,
+      "num_tokens": 86168023.0,
+      "reward": 1.171875,
+      "reward_std": 0.15609991550445557,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1718749850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.26986658573150635,
+      "step": 746
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 786.0,
+      "completions/max_terminated_length": 786.0,
+      "completions/mean_length": 411.3571472167969,
+      "completions/mean_terminated_length": 411.3571472167969,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 0.770698994067578,
+      "grad_norm": 0.7727649211883545,
+      "kl": 0.08251953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0199,
+      "num_tokens": 86274617.0,
+      "reward": 1.3437501192092896,
+      "reward_std": 0.2190425992012024,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34375,
+      "rewards/curriculum_aware_reward_fn/std": 0.32543280720710754,
+      "step": 747
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 766.0,
+      "completions/max_terminated_length": 766.0,
+      "completions/mean_length": 446.3750305175781,
+      "completions/mean_terminated_length": 446.3750305175781,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "epoch": 0.7717307196285788,
+      "grad_norm": 0.6120944023132324,
+      "kl": 0.076416015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0006,
+      "num_tokens": 86394401.0,
+      "reward": 1.2375000715255737,
+      "reward_std": 0.1284002959728241,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.23749998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3016097843647003,
+      "step": 748
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1311.0,
+      "completions/max_terminated_length": 1311.0,
+      "completions/mean_length": 431.33038330078125,
+      "completions/mean_terminated_length": 431.33038330078125,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 0.7727624451895796,
+      "grad_norm": 0.8004124760627747,
+      "kl": 0.0838623046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0162,
+      "num_tokens": 86514502.0,
+      "reward": 1.3375000953674316,
+      "reward_std": 0.19743388891220093,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.31138312816619873,
+      "step": 749
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 729.0,
+      "completions/max_terminated_length": 729.0,
+      "completions/mean_length": 432.4732360839844,
+      "completions/mean_terminated_length": 432.4732360839844,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 0.7737941707505803,
+      "grad_norm": 0.8266004323959351,
+      "kl": 0.08447265625,
+      "learning_rate": 1e-06,
+      "loss": -0.005,
+      "num_tokens": 86630601.0,
+      "reward": 1.2843750715255737,
+      "reward_std": 0.21234013140201569,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28437498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3063907325267792,
+      "step": 750
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 984.0,
+      "completions/max_terminated_length": 984.0,
+      "completions/mean_length": 494.8035888671875,
+      "completions/mean_terminated_length": 494.8035888671875,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 0.7748258963115812,
+      "grad_norm": 0.7730136513710022,
+      "kl": 0.0733642578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0045,
+      "num_tokens": 86746198.0,
+      "reward": 1.2156250476837158,
+      "reward_std": 0.1886139214038849,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21562500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.26678189635276794,
+      "step": 751
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 944.0,
+      "completions/max_terminated_length": 944.0,
+      "completions/mean_length": 460.2500305175781,
+      "completions/mean_terminated_length": 460.2500305175781,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 0.7758576218725819,
+      "grad_norm": 0.7083492279052734,
+      "kl": 0.0791015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0041,
+      "num_tokens": 86860942.0,
+      "reward": 1.2281250953674316,
+      "reward_std": 0.127069354057312,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22812499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.284650981426239,
+      "step": 752
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 780.0,
+      "completions/max_terminated_length": 780.0,
+      "completions/mean_length": 427.7589416503906,
+      "completions/mean_terminated_length": 427.7589416503906,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "epoch": 0.7768893474335826,
+      "grad_norm": 0.8819575905799866,
+      "kl": 0.0924072265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0011,
+      "num_tokens": 86976975.0,
+      "reward": 1.296875238418579,
+      "reward_std": 0.17696364223957062,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.296875,
+      "rewards/curriculum_aware_reward_fn/std": 0.29791173338890076,
+      "step": 753
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 660.0,
+      "completions/max_terminated_length": 660.0,
+      "completions/mean_length": 411.6339416503906,
+      "completions/mean_terminated_length": 411.6339416503906,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 0.7779210729945835,
+      "grad_norm": 0.6597311496734619,
+      "kl": 0.079345703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0233,
+      "num_tokens": 87084587.0,
+      "reward": 1.3375002145767212,
+      "reward_std": 0.14886173605918884,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3005625307559967,
+      "step": 754
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 805.0,
+      "completions/max_terminated_length": 805.0,
+      "completions/mean_length": 441.8750305175781,
+      "completions/mean_terminated_length": 441.8750305175781,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 0.7789527985555842,
+      "grad_norm": 0.7229986786842346,
+      "kl": 0.08251953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0129,
+      "num_tokens": 87201557.0,
+      "reward": 1.4125001430511475,
+      "reward_std": 0.18330040574073792,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41249996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.30884116888046265,
+      "step": 755
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 876.0,
+      "completions/max_terminated_length": 876.0,
+      "completions/mean_length": 446.14288330078125,
+      "completions/mean_terminated_length": 446.14288330078125,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 0.779984524116585,
+      "grad_norm": 0.6569157242774963,
+      "kl": 0.0789794921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0141,
+      "num_tokens": 87312502.0,
+      "reward": 1.3375002145767212,
+      "reward_std": 0.1148693636059761,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3286266624927521,
+      "step": 756
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 690.0,
+      "completions/max_terminated_length": 690.0,
+      "completions/mean_length": 376.6071472167969,
+      "completions/mean_terminated_length": 376.6071472167969,
+      "completions/min_length": 135.0,
+      "completions/min_terminated_length": 135.0,
+      "epoch": 0.7810162496775858,
+      "grad_norm": 0.583123505115509,
+      "kl": 0.0899658203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0074,
+      "num_tokens": 87414224.0,
+      "reward": 1.4125001430511475,
+      "reward_std": 0.07358650118112564,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41249996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.3295847773551941,
+      "step": 757
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 791.0,
+      "completions/max_terminated_length": 791.0,
+      "completions/mean_length": 426.9910888671875,
+      "completions/mean_terminated_length": 426.9910888671875,
+      "completions/min_length": 100.0,
+      "completions/min_terminated_length": 100.0,
+      "epoch": 0.7820479752385865,
+      "grad_norm": 0.8279862999916077,
+      "kl": 0.0753173828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0155,
+      "num_tokens": 87525600.0,
+      "reward": 1.25,
+      "reward_std": 0.1890469193458557,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25,
+      "rewards/curriculum_aware_reward_fn/std": 0.302392840385437,
+      "step": 758
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 975.0,
+      "completions/max_terminated_length": 975.0,
+      "completions/mean_length": 459.33929443359375,
+      "completions/mean_terminated_length": 459.33929443359375,
+      "completions/min_length": 245.0,
+      "completions/min_terminated_length": 245.0,
+      "epoch": 0.7830797007995873,
+      "grad_norm": 0.5621715188026428,
+      "kl": 0.06854248046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0069,
+      "num_tokens": 87639556.0,
+      "reward": 1.1500000953674316,
+      "reward_std": 0.11704766750335693,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.24732807278633118,
+      "step": 759
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 738.0,
+      "completions/max_terminated_length": 738.0,
+      "completions/mean_length": 384.9821472167969,
+      "completions/mean_terminated_length": 384.9821472167969,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 0.7841114263605881,
+      "grad_norm": 0.7570614218711853,
+      "kl": 0.0887451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0094,
+      "num_tokens": 87749804.0,
+      "reward": 1.2843750715255737,
+      "reward_std": 0.1683778166770935,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28437498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.330644428730011,
+      "step": 760
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1262.0,
+      "completions/max_terminated_length": 1262.0,
+      "completions/mean_length": 477.77679443359375,
+      "completions/mean_terminated_length": 477.77679443359375,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 0.7851431519215889,
+      "grad_norm": 0.643505871295929,
+      "kl": 0.0787353515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0008,
+      "num_tokens": 87874260.0,
+      "reward": 1.2937501668930054,
+      "reward_std": 0.15550830960273743,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.32055166363716125,
+      "step": 761
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 990.0,
+      "completions/max_terminated_length": 990.0,
+      "completions/mean_length": 402.0982360839844,
+      "completions/mean_terminated_length": 402.0982360839844,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 0.7861748774825896,
+      "grad_norm": 0.7015209197998047,
+      "kl": 0.0675048828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0022,
+      "num_tokens": 87979810.0,
+      "reward": 1.3062500953674316,
+      "reward_std": 0.1959277242422104,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.32591691613197327,
+      "step": 762
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 717.0,
+      "completions/max_terminated_length": 717.0,
+      "completions/mean_length": 364.08038330078125,
+      "completions/mean_terminated_length": 364.08038330078125,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 0.7872066030435904,
+      "grad_norm": 0.7301487922668457,
+      "kl": 0.0853271484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0033,
+      "num_tokens": 88080185.0,
+      "reward": 1.3781250715255737,
+      "reward_std": 0.15376612544059753,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.304972380399704,
+      "step": 763
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 856.0,
+      "completions/max_terminated_length": 856.0,
+      "completions/mean_length": 373.9196472167969,
+      "completions/mean_terminated_length": 373.9196472167969,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 0.7882383286045912,
+      "grad_norm": 0.8684947490692139,
+      "kl": 0.0743408203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0194,
+      "num_tokens": 88180136.0,
+      "reward": 1.4125001430511475,
+      "reward_std": 0.14428050816059113,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4124999940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.29420071840286255,
+      "step": 764
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1064.0,
+      "completions/max_terminated_length": 1064.0,
+      "completions/mean_length": 463.02679443359375,
+      "completions/mean_terminated_length": 463.02679443359375,
+      "completions/min_length": 275.0,
+      "completions/min_terminated_length": 275.0,
+      "epoch": 0.7892700541655919,
+      "grad_norm": 0.6717519760131836,
+      "kl": 0.0751953125,
+      "learning_rate": 1e-06,
+      "loss": -0.012,
+      "num_tokens": 88312908.0,
+      "reward": 1.2156251668930054,
+      "reward_std": 0.16076023876667023,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21562497317790985,
+      "rewards/curriculum_aware_reward_fn/std": 0.3017241060733795,
+      "step": 765
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1097.0,
+      "completions/max_terminated_length": 1097.0,
+      "completions/mean_length": 456.77679443359375,
+      "completions/mean_terminated_length": 456.77679443359375,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 0.7903017797265928,
+      "grad_norm": 0.7640579342842102,
+      "kl": 0.0853271484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0133,
+      "num_tokens": 88437665.0,
+      "reward": 1.25,
+      "reward_std": 0.18398447334766388,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2500000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.2912384271621704,
+      "step": 766
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 645.0,
+      "completions/max_terminated_length": 645.0,
+      "completions/mean_length": 387.0982360839844,
+      "completions/mean_terminated_length": 387.0982360839844,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "epoch": 0.7913335052875935,
+      "grad_norm": 0.7477678060531616,
+      "kl": 0.0906982421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0197,
+      "num_tokens": 88554037.0,
+      "reward": 1.2625001668930054,
+      "reward_std": 0.14247135818004608,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.26250001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.2989847958087921,
+      "step": 767
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 613.0,
+      "completions/max_terminated_length": 613.0,
+      "completions/mean_length": 370.9464416503906,
+      "completions/mean_terminated_length": 370.9464416503906,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 0.7923652308485942,
+      "grad_norm": 0.8266369104385376,
+      "kl": 0.086181640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0031,
+      "num_tokens": 88661068.0,
+      "reward": 1.3660715818405151,
+      "reward_std": 0.18112213909626007,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.375,
+      "rewards/curriculum_aware_reward_fn/std": 0.3141555190086365,
+      "step": 768
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1077.0,
+      "completions/max_terminated_length": 1077.0,
+      "completions/mean_length": 429.1071472167969,
+      "completions/mean_terminated_length": 429.1071472167969,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "epoch": 0.7933969564095951,
+      "grad_norm": 0.7582897543907166,
+      "kl": 0.08447265625,
+      "learning_rate": 1e-06,
+      "loss": 0.013,
+      "num_tokens": 88776657.0,
+      "reward": 1.2687500715255737,
+      "reward_std": 0.15974640846252441,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.26874998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3080105483531952,
+      "step": 769
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 677.0,
+      "completions/max_terminated_length": 677.0,
+      "completions/mean_length": 389.01788330078125,
+      "completions/mean_terminated_length": 389.01788330078125,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 0.7944286819705958,
+      "grad_norm": 0.6961264610290527,
+      "kl": 0.0784912109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0184,
+      "num_tokens": 88886447.0,
+      "reward": 1.3468753099441528,
+      "reward_std": 0.13390925526618958,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3271692097187042,
+      "step": 770
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 690.0,
+      "completions/max_terminated_length": 690.0,
+      "completions/mean_length": 418.89288330078125,
+      "completions/mean_terminated_length": 418.89288330078125,
+      "completions/min_length": 254.0,
+      "completions/min_terminated_length": 254.0,
+      "epoch": 0.7954604075315966,
+      "grad_norm": 0.7252551317214966,
+      "kl": 0.0838623046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0102,
+      "num_tokens": 89005617.0,
+      "reward": 1.2406251430511475,
+      "reward_std": 0.16366921365261078,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24062499403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.31174683570861816,
+      "step": 771
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 772.0,
+      "completions/max_terminated_length": 772.0,
+      "completions/mean_length": 365.5982360839844,
+      "completions/mean_terminated_length": 365.5982360839844,
+      "completions/min_length": 152.0,
+      "completions/min_terminated_length": 152.0,
+      "epoch": 0.7964921330925974,
+      "grad_norm": 0.727345883846283,
+      "kl": 0.099365234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0015,
+      "num_tokens": 89113946.0,
+      "reward": 1.3500001430511475,
+      "reward_std": 0.15670651197433472,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34999996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.31515759229660034,
+      "step": 772
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1211.0,
+      "completions/max_terminated_length": 1211.0,
+      "completions/mean_length": 401.76788330078125,
+      "completions/mean_terminated_length": 401.76788330078125,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 0.7975238586535981,
+      "grad_norm": 0.6628202795982361,
+      "kl": 0.0909423828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0239,
+      "num_tokens": 89224758.0,
+      "reward": 1.3441965579986572,
+      "reward_std": 0.1218257024884224,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3062620460987091,
+      "step": 773
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 964.0,
+      "completions/max_terminated_length": 964.0,
+      "completions/mean_length": 452.419677734375,
+      "completions/mean_terminated_length": 452.419677734375,
+      "completions/min_length": 259.0,
+      "completions/min_terminated_length": 259.0,
+      "epoch": 0.7985555842145989,
+      "grad_norm": 0.8155584931373596,
+      "kl": 0.0916748046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0088,
+      "num_tokens": 89346934.0,
+      "reward": 1.2562501430511475,
+      "reward_std": 0.1607874482870102,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25624996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.25724852085113525,
+      "step": 774
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 714.0,
+      "completions/max_terminated_length": 714.0,
+      "completions/mean_length": 439.27679443359375,
+      "completions/mean_terminated_length": 439.27679443359375,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "epoch": 0.7995873097755997,
+      "grad_norm": 0.5273690819740295,
+      "kl": 0.0804443359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0031,
+      "num_tokens": 89466476.0,
+      "reward": 1.2250001430511475,
+      "reward_std": 0.0889258086681366,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22500000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.2890649735927582,
+      "step": 775
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 799.0,
+      "completions/max_terminated_length": 799.0,
+      "completions/mean_length": 430.2321472167969,
+      "completions/mean_terminated_length": 430.2321472167969,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "epoch": 0.8006190353366005,
+      "grad_norm": 0.7398731708526611,
+      "kl": 0.0816650390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0014,
+      "num_tokens": 89571390.0,
+      "reward": 1.25,
+      "reward_std": 0.1409335881471634,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25,
+      "rewards/curriculum_aware_reward_fn/std": 0.31315022706985474,
+      "step": 776
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 638.0,
+      "completions/max_terminated_length": 638.0,
+      "completions/mean_length": 406.5446472167969,
+      "completions/mean_terminated_length": 406.5446472167969,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 0.8016507608976012,
+      "grad_norm": 0.686924934387207,
+      "kl": 0.0880126953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0039,
+      "num_tokens": 89684039.0,
+      "reward": 1.3375002145767212,
+      "reward_std": 0.09972135722637177,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33750003576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.2968680262565613,
+      "step": 777
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 618.0,
+      "completions/max_terminated_length": 618.0,
+      "completions/mean_length": 397.0357360839844,
+      "completions/mean_terminated_length": 397.0357360839844,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 0.802682486458602,
+      "grad_norm": 0.7994930744171143,
+      "kl": 0.0853271484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0087,
+      "num_tokens": 89792753.0,
+      "reward": 1.3343751430511475,
+      "reward_std": 0.17851689457893372,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.29857251048088074,
+      "step": 778
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 818.0,
+      "completions/max_terminated_length": 818.0,
+      "completions/mean_length": 425.7232360839844,
+      "completions/mean_terminated_length": 425.7232360839844,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 0.8037142120196028,
+      "grad_norm": 0.7430455684661865,
+      "kl": 0.0855712890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0136,
+      "num_tokens": 89909216.0,
+      "reward": 1.3218750953674316,
+      "reward_std": 0.1822567731142044,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32187503576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.32933056354522705,
+      "step": 779
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1038.0,
+      "completions/max_terminated_length": 1038.0,
+      "completions/mean_length": 462.3035888671875,
+      "completions/mean_terminated_length": 462.3035888671875,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 0.8047459375806035,
+      "grad_norm": 0.7800207138061523,
+      "kl": 0.078125,
+      "learning_rate": 1e-06,
+      "loss": 0.002,
+      "num_tokens": 90019633.0,
+      "reward": 1.2218750715255737,
+      "reward_std": 0.13174307346343994,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22187498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.27792516350746155,
+      "step": 780
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1368.0,
+      "completions/max_terminated_length": 1368.0,
+      "completions/mean_length": 438.52679443359375,
+      "completions/mean_terminated_length": 438.52679443359375,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 0.8057776631416044,
+      "grad_norm": 0.805196225643158,
+      "kl": 0.0794677734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0377,
+      "num_tokens": 90136032.0,
+      "reward": 1.3218752145767212,
+      "reward_std": 0.2107643485069275,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32187503576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.31212589144706726,
+      "step": 781
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 996.0,
+      "completions/max_terminated_length": 996.0,
+      "completions/mean_length": 434.64288330078125,
+      "completions/mean_terminated_length": 434.64288330078125,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 0.8068093887026051,
+      "grad_norm": 0.8555618524551392,
+      "kl": 0.0823974609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0052,
+      "num_tokens": 90252073.0,
+      "reward": 1.303125023841858,
+      "reward_std": 0.154591366648674,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30312496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.29896828532218933,
+      "step": 782
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 844.0,
+      "completions/max_terminated_length": 844.0,
+      "completions/mean_length": 454.0000305175781,
+      "completions/mean_terminated_length": 454.0000305175781,
+      "completions/min_length": 269.0,
+      "completions/min_terminated_length": 269.0,
+      "epoch": 0.8078411142636058,
+      "grad_norm": 0.7412270307540894,
+      "kl": 0.0733642578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0032,
+      "num_tokens": 90364171.0,
+      "reward": 1.1687500476837158,
+      "reward_std": 0.19139978289604187,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16875000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.2738715708255768,
+      "step": 783
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1343.0,
+      "completions/max_terminated_length": 1343.0,
+      "completions/mean_length": 429.7500305175781,
+      "completions/mean_terminated_length": 429.7500305175781,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 0.8088728398246067,
+      "grad_norm": 0.7853583693504333,
+      "kl": 0.0892333984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0072,
+      "num_tokens": 90487896.0,
+      "reward": 1.2687500715255737,
+      "reward_std": 0.15851913392543793,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.26875001192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.3080105185508728,
+      "step": 784
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 804.0,
+      "completions/max_terminated_length": 804.0,
+      "completions/mean_length": 440.2232360839844,
+      "completions/mean_terminated_length": 440.2232360839844,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "epoch": 0.8099045653856074,
+      "grad_norm": 0.7697345614433289,
+      "kl": 0.0906982421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0002,
+      "num_tokens": 90602819.0,
+      "reward": 1.1812500953674316,
+      "reward_std": 0.17448507249355316,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.18124999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.2738715410232544,
+      "step": 785
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1053.0,
+      "completions/max_terminated_length": 1053.0,
+      "completions/mean_length": 474.6160888671875,
+      "completions/mean_terminated_length": 474.6160888671875,
+      "completions/min_length": 243.0,
+      "completions/min_terminated_length": 243.0,
+      "epoch": 0.8109362909466082,
+      "grad_norm": 0.6602099537849426,
+      "kl": 0.091796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0265,
+      "num_tokens": 90723512.0,
+      "reward": 1.3375000953674316,
+      "reward_std": 0.1300942748785019,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.30781853199005127,
+      "step": 786
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 710.0,
+      "completions/max_terminated_length": 710.0,
+      "completions/mean_length": 425.0446472167969,
+      "completions/mean_terminated_length": 425.0446472167969,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 0.811968016507609,
+      "grad_norm": 0.6979688405990601,
+      "kl": 0.0908203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0071,
+      "num_tokens": 90836644.0,
+      "reward": 1.281250238418579,
+      "reward_std": 0.11639193445444107,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.31106650829315186,
+      "step": 787
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 920.0,
+      "completions/max_terminated_length": 920.0,
+      "completions/mean_length": 481.607177734375,
+      "completions/mean_terminated_length": 481.607177734375,
+      "completions/min_length": 269.0,
+      "completions/min_terminated_length": 269.0,
+      "epoch": 0.8129997420686097,
+      "grad_norm": 0.805659830570221,
+      "kl": 0.082763671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0219,
+      "num_tokens": 90958932.0,
+      "reward": 1.2093751430511475,
+      "reward_std": 0.15934167802333832,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.20937499403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.26766687631607056,
+      "step": 788
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 890.0,
+      "completions/max_terminated_length": 890.0,
+      "completions/mean_length": 405.33038330078125,
+      "completions/mean_terminated_length": 405.33038330078125,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "epoch": 0.8140314676296105,
+      "grad_norm": 0.8081932067871094,
+      "kl": 0.0911865234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0063,
+      "num_tokens": 91061057.0,
+      "reward": 1.328125,
+      "reward_std": 0.11970683932304382,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.328125,
+      "rewards/curriculum_aware_reward_fn/std": 0.2944517731666565,
+      "step": 789
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 916.0,
+      "completions/max_terminated_length": 916.0,
+      "completions/mean_length": 427.4910888671875,
+      "completions/mean_terminated_length": 427.4910888671875,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 0.8150631931906113,
+      "grad_norm": 0.835981547832489,
+      "kl": 0.0826416015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0014,
+      "num_tokens": 91178985.0,
+      "reward": 1.2937501668930054,
+      "reward_std": 0.14694808423519135,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29374998807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.3135904371738434,
+      "step": 790
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 819.0,
+      "completions/max_terminated_length": 819.0,
+      "completions/mean_length": 415.3750305175781,
+      "completions/mean_terminated_length": 415.3750305175781,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 0.8160949187516121,
+      "grad_norm": 0.7383970618247986,
+      "kl": 0.0889892578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0087,
+      "num_tokens": 91285399.0,
+      "reward": 1.28125,
+      "reward_std": 0.16779515147209167,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.3283267021179199,
+      "step": 791
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1140.0,
+      "completions/max_terminated_length": 1140.0,
+      "completions/mean_length": 487.0535888671875,
+      "completions/mean_terminated_length": 487.0535888671875,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 0.8171266443126128,
+      "grad_norm": 0.683323860168457,
+      "kl": 0.0792236328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0219,
+      "num_tokens": 91407149.0,
+      "reward": 1.2718751430511475,
+      "reward_std": 0.09385330229997635,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2718749940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.2885019779205322,
+      "step": 792
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 986.0,
+      "completions/max_terminated_length": 986.0,
+      "completions/mean_length": 419.1071472167969,
+      "completions/mean_terminated_length": 419.1071472167969,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 0.8181583698736136,
+      "grad_norm": 0.7385226488113403,
+      "kl": 0.086181640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0095,
+      "num_tokens": 91519343.0,
+      "reward": 1.3093751668930054,
+      "reward_std": 0.155110701918602,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3507733643054962,
+      "step": 793
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 831.0,
+      "completions/max_terminated_length": 831.0,
+      "completions/mean_length": 432.1875305175781,
+      "completions/mean_terminated_length": 432.1875305175781,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.8191900954346144,
+      "grad_norm": 0.8252224326133728,
+      "kl": 0.092041015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0237,
+      "num_tokens": 91633702.0,
+      "reward": 1.2625001668930054,
+      "reward_std": 0.1600162237882614,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.26249998807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.2989847660064697,
+      "step": 794
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 817.0,
+      "completions/max_terminated_length": 817.0,
+      "completions/mean_length": 438.1696472167969,
+      "completions/mean_terminated_length": 438.1696472167969,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 0.8202218209956151,
+      "grad_norm": 0.7697513103485107,
+      "kl": 0.0823974609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0059,
+      "num_tokens": 91738098.0,
+      "reward": 1.328125,
+      "reward_std": 0.15455463528633118,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.328125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3018546998500824,
+      "step": 795
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1126.0,
+      "completions/max_terminated_length": 1126.0,
+      "completions/mean_length": 434.8035888671875,
+      "completions/mean_terminated_length": 434.8035888671875,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 0.821253546556616,
+      "grad_norm": 0.7645787000656128,
+      "kl": 0.07861328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0114,
+      "num_tokens": 91859407.0,
+      "reward": 1.3812501430511475,
+      "reward_std": 0.16925767064094543,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38124996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.31005120277404785,
+      "step": 796
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1202.0,
+      "completions/max_terminated_length": 1202.0,
+      "completions/mean_length": 453.232177734375,
+      "completions/mean_terminated_length": 453.232177734375,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 0.8222852721176167,
+      "grad_norm": 0.7502590417861938,
+      "kl": 0.079833984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0088,
+      "num_tokens": 91982269.0,
+      "reward": 1.250000238418579,
+      "reward_std": 0.09290025383234024,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25,
+      "rewards/curriculum_aware_reward_fn/std": 0.29872098565101624,
+      "step": 797
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 931.0,
+      "completions/max_terminated_length": 931.0,
+      "completions/mean_length": 460.58929443359375,
+      "completions/mean_terminated_length": 460.58929443359375,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 0.8233169976786174,
+      "grad_norm": 0.7852086424827576,
+      "kl": 0.084228515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0219,
+      "num_tokens": 92100922.0,
+      "reward": 1.2281252145767212,
+      "reward_std": 0.1186632364988327,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22812499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.2560775876045227,
+      "step": 798
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 789.0,
+      "completions/max_terminated_length": 789.0,
+      "completions/mean_length": 433.2946472167969,
+      "completions/mean_terminated_length": 433.2946472167969,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "epoch": 0.8243487232396183,
+      "grad_norm": 0.7842056155204773,
+      "kl": 0.0833740234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0183,
+      "num_tokens": 92218296.0,
+      "reward": 1.3156250715255737,
+      "reward_std": 0.2091911882162094,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.31501689553260803,
+      "step": 799
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 665.0,
+      "completions/max_terminated_length": 665.0,
+      "completions/mean_length": 470.8750305175781,
+      "completions/mean_terminated_length": 470.8750305175781,
+      "completions/min_length": 286.0,
+      "completions/min_terminated_length": 286.0,
+      "epoch": 0.825380448800619,
+      "grad_norm": 0.8007763028144836,
+      "kl": 0.0863037109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0125,
+      "num_tokens": 92344982.0,
+      "reward": 1.2312501668930054,
+      "reward_std": 0.166080504655838,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.23125000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.2596884071826935,
+      "step": 800
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1138.0,
+      "completions/max_terminated_length": 1138.0,
+      "completions/mean_length": 463.669677734375,
+      "completions/mean_terminated_length": 463.669677734375,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 0.8264121743616198,
+      "grad_norm": 0.6105215549468994,
+      "kl": 0.0792236328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0339,
+      "num_tokens": 92462860.0,
+      "reward": 1.2156251668930054,
+      "reward_std": 0.14187639951705933,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21562500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.2789161205291748,
+      "step": 801
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1159.0,
+      "completions/max_terminated_length": 1159.0,
+      "completions/mean_length": 421.8660888671875,
+      "completions/mean_terminated_length": 421.8660888671875,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 0.8274438999226206,
+      "grad_norm": 0.8359569311141968,
+      "kl": 0.082763671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0386,
+      "num_tokens": 92576777.0,
+      "reward": 1.3843752145767212,
+      "reward_std": 0.1463547945022583,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3843750059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.31501689553260803,
+      "step": 802
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1227.0,
+      "completions/max_terminated_length": 1227.0,
+      "completions/mean_length": 462.044677734375,
+      "completions/mean_terminated_length": 462.044677734375,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 0.8284756254836213,
+      "grad_norm": 0.6880461573600769,
+      "kl": 0.084716796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0266,
+      "num_tokens": 92697434.0,
+      "reward": 1.3062502145767212,
+      "reward_std": 0.1431155502796173,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.32591691613197327,
+      "step": 803
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 751.0,
+      "completions/max_terminated_length": 751.0,
+      "completions/mean_length": 420.6339416503906,
+      "completions/mean_terminated_length": 420.6339416503906,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 0.8295073510446221,
+      "grad_norm": 0.7960591912269592,
+      "kl": 0.085205078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0154,
+      "num_tokens": 92811724.0,
+      "reward": 1.3125,
+      "reward_std": 0.1750551015138626,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3125,
+      "rewards/curriculum_aware_reward_fn/std": 0.33006277680397034,
+      "step": 804
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 847.0,
+      "completions/max_terminated_length": 847.0,
+      "completions/mean_length": 396.5357360839844,
+      "completions/mean_terminated_length": 396.5357360839844,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 0.8305390766056229,
+      "grad_norm": 0.8146962523460388,
+      "kl": 0.0845947265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0012,
+      "num_tokens": 92921719.0,
+      "reward": 1.3562501668930054,
+      "reward_std": 0.1803237348794937,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.31509506702423096,
+      "step": 805
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 856.0,
+      "completions/max_terminated_length": 856.0,
+      "completions/mean_length": 422.3125305175781,
+      "completions/mean_terminated_length": 422.3125305175781,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "epoch": 0.8315708021666237,
+      "grad_norm": 0.7674344778060913,
+      "kl": 0.0875244140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0126,
+      "num_tokens": 93030527.0,
+      "reward": 1.2437500953674316,
+      "reward_std": 0.12594076991081238,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24375000596046448,
+      "rewards/curriculum_aware_reward_fn/std": 0.3074982464313507,
+      "step": 806
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 808.0,
+      "completions/max_terminated_length": 808.0,
+      "completions/mean_length": 422.2232360839844,
+      "completions/mean_terminated_length": 422.2232360839844,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 0.8326025277276244,
+      "grad_norm": 0.862937867641449,
+      "kl": 0.08349609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0068,
+      "num_tokens": 93143588.0,
+      "reward": 1.3781250715255737,
+      "reward_std": 0.18341077864170074,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.29764699935913086,
+      "step": 807
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1708.0,
+      "completions/max_terminated_length": 1708.0,
+      "completions/mean_length": 466.4732360839844,
+      "completions/mean_terminated_length": 466.4732360839844,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 0.8336342532886252,
+      "grad_norm": 0.7200672626495361,
+      "kl": 0.0762939453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0187,
+      "num_tokens": 93262186.0,
+      "reward": 1.2281252145767212,
+      "reward_std": 0.11673900485038757,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22812499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.27678829431533813,
+      "step": 808
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 696.0,
+      "completions/max_terminated_length": 696.0,
+      "completions/mean_length": 409.6160888671875,
+      "completions/mean_terminated_length": 409.6160888671875,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 0.834665978849626,
+      "grad_norm": 0.8438058495521545,
+      "kl": 0.089599609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0151,
+      "num_tokens": 93372204.0,
+      "reward": 1.2879464626312256,
+      "reward_std": 0.2107357233762741,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.296875,
+      "rewards/curriculum_aware_reward_fn/std": 0.30159345269203186,
+      "step": 809
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 862.0,
+      "completions/max_terminated_length": 862.0,
+      "completions/mean_length": 476.0535888671875,
+      "completions/mean_terminated_length": 476.0535888671875,
+      "completions/min_length": 306.0,
+      "completions/min_terminated_length": 306.0,
+      "epoch": 0.8356977044106267,
+      "grad_norm": 0.7459707260131836,
+      "kl": 0.0853271484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0241,
+      "num_tokens": 93507296.0,
+      "reward": 1.2125002145767212,
+      "reward_std": 0.18899433314800262,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21249999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.275378555059433,
+      "step": 810
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1348.0,
+      "completions/max_terminated_length": 1348.0,
+      "completions/mean_length": 473.0357360839844,
+      "completions/mean_terminated_length": 473.0357360839844,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 0.8367294299716276,
+      "grad_norm": 0.7402332425117493,
+      "kl": 0.0797119140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0096,
+      "num_tokens": 93627539.0,
+      "reward": 1.203125,
+      "reward_std": 0.11372973769903183,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.203125,
+      "rewards/curriculum_aware_reward_fn/std": 0.2804662585258484,
+      "step": 811
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 949.0,
+      "completions/max_terminated_length": 949.0,
+      "completions/mean_length": 416.5714416503906,
+      "completions/mean_terminated_length": 416.5714416503906,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 0.8377611555326283,
+      "grad_norm": 0.723020076751709,
+      "kl": 0.09765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0083,
+      "num_tokens": 93743019.0,
+      "reward": 1.146875023841858,
+      "reward_std": 0.12751619517803192,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.23788492381572723,
+      "step": 812
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 819.0,
+      "completions/max_terminated_length": 819.0,
+      "completions/mean_length": 445.107177734375,
+      "completions/mean_terminated_length": 445.107177734375,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 0.8387928810936292,
+      "grad_norm": 0.6011431813240051,
+      "kl": 0.0804443359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0041,
+      "num_tokens": 93863389.0,
+      "reward": 1.171875,
+      "reward_std": 0.1036277711391449,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.171875,
+      "rewards/curriculum_aware_reward_fn/std": 0.27792516350746155,
+      "step": 813
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1143.0,
+      "completions/max_terminated_length": 1143.0,
+      "completions/mean_length": 458.169677734375,
+      "completions/mean_terminated_length": 458.169677734375,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 0.8398246066546299,
+      "grad_norm": 0.7934638857841492,
+      "kl": 0.08935546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0016,
+      "num_tokens": 93980077.0,
+      "reward": 1.359375238418579,
+      "reward_std": 0.13116928935050964,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.359375,
+      "rewards/curriculum_aware_reward_fn/std": 0.30613335967063904,
+      "step": 814
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 814.0,
+      "completions/max_terminated_length": 814.0,
+      "completions/mean_length": 401.9375305175781,
+      "completions/mean_terminated_length": 401.9375305175781,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "epoch": 0.8408563322156306,
+      "grad_norm": 0.5980004668235779,
+      "kl": 0.0919189453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0089,
+      "num_tokens": 94097467.0,
+      "reward": 1.2437502145767212,
+      "reward_std": 0.10399264842271805,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24374999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.2773040235042572,
+      "step": 815
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 780.0,
+      "completions/max_terminated_length": 780.0,
+      "completions/mean_length": 403.6160888671875,
+      "completions/mean_terminated_length": 403.6160888671875,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 0.8418880577766314,
+      "grad_norm": 0.721108078956604,
+      "kl": 0.094482421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0025,
+      "num_tokens": 94199239.0,
+      "reward": 1.3343751430511475,
+      "reward_std": 0.14032889902591705,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3343749940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.31998249888420105,
+      "step": 816
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 945.0,
+      "completions/max_terminated_length": 945.0,
+      "completions/mean_length": 431.70538330078125,
+      "completions/mean_terminated_length": 431.70538330078125,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 0.8429197833376322,
+      "grad_norm": 0.7424548864364624,
+      "kl": 0.085205078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0047,
+      "num_tokens": 94312984.0,
+      "reward": 1.2875001430511475,
+      "reward_std": 0.11729838699102402,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28749996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.29420071840286255,
+      "step": 817
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 848.0,
+      "completions/max_terminated_length": 848.0,
+      "completions/mean_length": 414.5357360839844,
+      "completions/mean_terminated_length": 414.5357360839844,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 0.843951508898633,
+      "grad_norm": 0.7833738327026367,
+      "kl": 0.087646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0015,
+      "num_tokens": 94425142.0,
+      "reward": 1.2000001668930054,
+      "reward_std": 0.16399544477462769,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.20000000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.331254780292511,
+      "step": 818
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 677.0,
+      "completions/max_terminated_length": 677.0,
+      "completions/mean_length": 444.669677734375,
+      "completions/mean_terminated_length": 444.669677734375,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 0.8449832344596337,
+      "grad_norm": 0.7755659222602844,
+      "kl": 0.084228515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0107,
+      "num_tokens": 94545204.0,
+      "reward": 1.2531250715255737,
+      "reward_std": 0.174255833029747,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25312498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.30882522463798523,
+      "step": 819
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1086.0,
+      "completions/max_terminated_length": 1086.0,
+      "completions/mean_length": 421.6785888671875,
+      "completions/mean_terminated_length": 421.6785888671875,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 0.8460149600206345,
+      "grad_norm": 0.7222782969474792,
+      "kl": 0.093017578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0159,
+      "num_tokens": 94655831.0,
+      "reward": 1.4750001430511475,
+      "reward_std": 0.1430896818637848,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47499996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.28522157669067383,
+      "step": 820
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 814.0,
+      "completions/max_terminated_length": 814.0,
+      "completions/mean_length": 435.1607360839844,
+      "completions/mean_terminated_length": 435.1607360839844,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 0.8470466855816353,
+      "grad_norm": 0.8592573404312134,
+      "kl": 0.0877685546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0067,
+      "num_tokens": 94771489.0,
+      "reward": 1.371875286102295,
+      "reward_std": 0.1934904009103775,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37187498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.3018546998500824,
+      "step": 821
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 672.0,
+      "completions/max_terminated_length": 672.0,
+      "completions/mean_length": 419.0000305175781,
+      "completions/mean_terminated_length": 419.0000305175781,
+      "completions/min_length": 257.0,
+      "completions/min_terminated_length": 257.0,
+      "epoch": 0.848078411142636,
+      "grad_norm": 0.71229088306427,
+      "kl": 0.0948486328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0018,
+      "num_tokens": 94882873.0,
+      "reward": 1.4031251668930054,
+      "reward_std": 0.12293804436922073,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40312501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.30159345269203186,
+      "step": 822
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1017.0,
+      "completions/max_terminated_length": 1017.0,
+      "completions/mean_length": 453.5089416503906,
+      "completions/mean_terminated_length": 453.5089416503906,
+      "completions/min_length": 143.0,
+      "completions/min_terminated_length": 143.0,
+      "epoch": 0.8491101367036369,
+      "grad_norm": 0.7214085459709167,
+      "kl": 0.084716796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0232,
+      "num_tokens": 95001867.0,
+      "reward": 1.3218752145767212,
+      "reward_std": 0.16592714190483093,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3218750059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.32255885004997253,
+      "step": 823
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 968.0,
+      "completions/max_terminated_length": 968.0,
+      "completions/mean_length": 413.6607360839844,
+      "completions/mean_terminated_length": 413.6607360839844,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 0.8501418622646376,
+      "grad_norm": 0.8571239709854126,
+      "kl": 0.0865478515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0314,
+      "num_tokens": 95109734.0,
+      "reward": 1.2562501430511475,
+      "reward_std": 0.18357297778129578,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2562499940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.2933286130428314,
+      "step": 824
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 785.0,
+      "completions/max_terminated_length": 785.0,
+      "completions/mean_length": 392.1607360839844,
+      "completions/mean_terminated_length": 392.1607360839844,
+      "completions/min_length": 155.0,
+      "completions/min_terminated_length": 155.0,
+      "epoch": 0.8511735878256383,
+      "grad_norm": 0.6368698477745056,
+      "kl": 0.0819091796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0053,
+      "num_tokens": 95212813.0,
+      "reward": 1.2875001430511475,
+      "reward_std": 0.10738946497440338,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28749996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.3228183686733246,
+      "step": 825
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 773.0,
+      "completions/max_terminated_length": 773.0,
+      "completions/mean_length": 403.5446472167969,
+      "completions/mean_terminated_length": 403.5446472167969,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 0.8522053133866392,
+      "grad_norm": 0.806452214717865,
+      "kl": 0.0963134765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0234,
+      "num_tokens": 95314731.0,
+      "reward": 1.3656251430511475,
+      "reward_std": 0.13126225769519806,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3656249940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.2910861074924469,
+      "step": 826
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 663.0,
+      "completions/max_terminated_length": 663.0,
+      "completions/mean_length": 421.5089416503906,
+      "completions/mean_terminated_length": 421.5089416503906,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "epoch": 0.8532370389476399,
+      "grad_norm": 0.8154585361480713,
+      "kl": 0.08837890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0131,
+      "num_tokens": 95432451.0,
+      "reward": 1.1441963911056519,
+      "reward_std": 0.18211181461811066,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15312500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.24312911927700043,
+      "step": 827
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 812.0,
+      "completions/max_terminated_length": 812.0,
+      "completions/mean_length": 413.3214416503906,
+      "completions/mean_terminated_length": 413.3214416503906,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 0.8542687645086408,
+      "grad_norm": 0.8320736885070801,
+      "kl": 0.0955810546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0187,
+      "num_tokens": 95549072.0,
+      "reward": 1.3093750476837158,
+      "reward_std": 0.13835106790065765,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30937498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.3177575469017029,
+      "step": 828
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 753.0,
+      "completions/max_terminated_length": 753.0,
+      "completions/mean_length": 370.8125305175781,
+      "completions/mean_terminated_length": 370.8125305175781,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 0.8553004900696415,
+      "grad_norm": 0.8292173147201538,
+      "kl": 0.1004638671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0045,
+      "num_tokens": 95652238.0,
+      "reward": 1.3906251192092896,
+      "reward_std": 0.1790168732404709,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.390625,
+      "rewards/curriculum_aware_reward_fn/std": 0.2998897135257721,
+      "step": 829
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 920.0,
+      "completions/max_terminated_length": 920.0,
+      "completions/mean_length": 447.2232360839844,
+      "completions/mean_terminated_length": 447.2232360839844,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 0.8563322156306422,
+      "grad_norm": 0.7676867246627808,
+      "kl": 0.0826416015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0195,
+      "num_tokens": 95772187.0,
+      "reward": 1.334375023841858,
+      "reward_std": 0.1484861969947815,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.3094627261161804,
+      "step": 830
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 713.0,
+      "completions/max_terminated_length": 713.0,
+      "completions/mean_length": 390.76788330078125,
+      "completions/mean_terminated_length": 390.76788330078125,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 0.857363941191643,
+      "grad_norm": 0.8843021988868713,
+      "kl": 0.09521484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0165,
+      "num_tokens": 95882726.0,
+      "reward": 1.2906250953674316,
+      "reward_std": 0.1612338423728943,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3004149794578552,
+      "step": 831
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 701.0,
+      "completions/max_terminated_length": 701.0,
+      "completions/mean_length": 390.39288330078125,
+      "completions/mean_terminated_length": 390.39288330078125,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 0.8583956667526438,
+      "grad_norm": 0.7326264977455139,
+      "kl": 0.09228515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0043,
+      "num_tokens": 95989394.0,
+      "reward": 1.2250001430511475,
+      "reward_std": 0.11293934285640717,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22500000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.30030015110969543,
+      "step": 832
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 740.0,
+      "completions/max_terminated_length": 740.0,
+      "completions/mean_length": 385.40179443359375,
+      "completions/mean_terminated_length": 385.40179443359375,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 0.8594273923136446,
+      "grad_norm": 0.9363760352134705,
+      "kl": 0.17724609375,
+      "learning_rate": 1e-06,
+      "loss": 0.006,
+      "num_tokens": 96104561.0,
+      "reward": 1.3406251668930054,
+      "reward_std": 0.09900873154401779,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34062501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3097173273563385,
+      "step": 833
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 767.0,
+      "completions/max_terminated_length": 767.0,
+      "completions/mean_length": 398.76788330078125,
+      "completions/mean_terminated_length": 398.76788330078125,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 0.8604591178746454,
+      "grad_norm": 0.5181918144226074,
+      "kl": 0.0869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0117,
+      "num_tokens": 96220532.0,
+      "reward": 1.328125238418579,
+      "reward_std": 0.07820750772953033,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.328125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3331383168697357,
+      "step": 834
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 747.0,
+      "completions/max_terminated_length": 747.0,
+      "completions/mean_length": 387.26788330078125,
+      "completions/mean_terminated_length": 387.26788330078125,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 0.8614908434356461,
+      "grad_norm": 0.5764657258987427,
+      "kl": 0.09765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0065,
+      "num_tokens": 96327557.0,
+      "reward": 1.3468750715255737,
+      "reward_std": 0.0906006395816803,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34687501192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.3371369540691376,
+      "step": 835
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 808.0,
+      "completions/max_terminated_length": 808.0,
+      "completions/mean_length": 387.6071472167969,
+      "completions/mean_terminated_length": 387.6071472167969,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 0.8625225689966469,
+      "grad_norm": 0.834621787071228,
+      "kl": 0.088623046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0322,
+      "num_tokens": 96442887.0,
+      "reward": 1.3312500715255737,
+      "reward_std": 0.15943463146686554,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33125001192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.32832667231559753,
+      "step": 836
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 718.0,
+      "completions/max_terminated_length": 718.0,
+      "completions/mean_length": 403.1250305175781,
+      "completions/mean_terminated_length": 403.1250305175781,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 0.8635542945576477,
+      "grad_norm": 0.7438127398490906,
+      "kl": 0.0938720703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0127,
+      "num_tokens": 96552269.0,
+      "reward": 1.2535713911056519,
+      "reward_std": 0.12903402745723724,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.26249998807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.2989847660064697,
+      "step": 837
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 777.0,
+      "completions/max_terminated_length": 777.0,
+      "completions/mean_length": 395.5625305175781,
+      "completions/mean_terminated_length": 395.5625305175781,
+      "completions/min_length": 145.0,
+      "completions/min_terminated_length": 145.0,
+      "epoch": 0.8645860201186485,
+      "grad_norm": 0.799825131893158,
+      "kl": 0.088623046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0119,
+      "num_tokens": 96662778.0,
+      "reward": 1.3250001668930054,
+      "reward_std": 0.1471077799797058,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32500001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3176490366458893,
+      "step": 838
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1083.0,
+      "completions/max_terminated_length": 1083.0,
+      "completions/mean_length": 399.89288330078125,
+      "completions/mean_terminated_length": 399.89288330078125,
+      "completions/min_length": 145.0,
+      "completions/min_terminated_length": 145.0,
+      "epoch": 0.8656177456796492,
+      "grad_norm": 0.7156750559806824,
+      "kl": 0.08203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0155,
+      "num_tokens": 96775018.0,
+      "reward": 1.3437501192092896,
+      "reward_std": 0.10255005210638046,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34375,
+      "rewards/curriculum_aware_reward_fn/std": 0.3080105185508728,
+      "step": 839
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 804.0,
+      "completions/max_terminated_length": 804.0,
+      "completions/mean_length": 386.9464416503906,
+      "completions/mean_terminated_length": 386.9464416503906,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 0.86664947124065,
+      "grad_norm": 0.7068809270858765,
+      "kl": 0.104736328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0071,
+      "num_tokens": 96887159.0,
+      "reward": 1.312500238418579,
+      "reward_std": 0.15039849281311035,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3125,
+      "rewards/curriculum_aware_reward_fn/std": 0.33338961005210876,
+      "step": 840
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 688.0,
+      "completions/max_terminated_length": 688.0,
+      "completions/mean_length": 395.02679443359375,
+      "completions/mean_terminated_length": 395.02679443359375,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 0.8676811968016508,
+      "grad_norm": 0.6292682886123657,
+      "kl": 0.0875244140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0076,
+      "num_tokens": 96993506.0,
+      "reward": 1.3656251430511475,
+      "reward_std": 0.09355327486991882,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3656249940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.4079188406467438,
+      "step": 841
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 669.0,
+      "completions/max_terminated_length": 669.0,
+      "completions/mean_length": 374.5357360839844,
+      "completions/mean_terminated_length": 374.5357360839844,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 0.8687129223626515,
+      "grad_norm": 0.7815769910812378,
+      "kl": 0.096435546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0273,
+      "num_tokens": 97096151.0,
+      "reward": 1.3125,
+      "reward_std": 0.12005040049552917,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3057629466056824,
+      "step": 842
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 722.0,
+      "completions/max_terminated_length": 722.0,
+      "completions/mean_length": 356.83038330078125,
+      "completions/mean_terminated_length": 356.83038330078125,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 0.8697446479236524,
+      "grad_norm": 0.9005613327026367,
+      "kl": 0.0909423828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0012,
+      "num_tokens": 97200201.0,
+      "reward": 1.3375000953674316,
+      "reward_std": 0.1584685891866684,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.30421215295791626,
+      "step": 843
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 673.0,
+      "completions/max_terminated_length": 673.0,
+      "completions/mean_length": 375.0714416503906,
+      "completions/mean_terminated_length": 375.0714416503906,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 0.8707763734846531,
+      "grad_norm": 0.7789735198020935,
+      "kl": 0.0931396484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0192,
+      "num_tokens": 97310053.0,
+      "reward": 1.3406250476837158,
+      "reward_std": 0.1421245038509369,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34062498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.31326034665107727,
+      "step": 844
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1432.0,
+      "completions/max_terminated_length": 1432.0,
+      "completions/mean_length": 381.8035888671875,
+      "completions/mean_terminated_length": 381.8035888671875,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "epoch": 0.8718080990456538,
+      "grad_norm": 0.8505178689956665,
+      "kl": 0.0975341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0025,
+      "num_tokens": 97419805.0,
+      "reward": 1.3937500715255737,
+      "reward_std": 0.1500549167394638,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39374998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.31907275319099426,
+      "step": 845
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1939.0,
+      "completions/max_terminated_length": 1939.0,
+      "completions/mean_length": 362.9285888671875,
+      "completions/mean_terminated_length": 362.9285888671875,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "epoch": 0.8728398246066547,
+      "grad_norm": 0.6682746410369873,
+      "kl": 0.0966796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0051,
+      "num_tokens": 97517636.0,
+      "reward": 1.3687502145767212,
+      "reward_std": 0.09541250765323639,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3687500059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.32832667231559753,
+      "step": 846
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 689.0,
+      "completions/max_terminated_length": 689.0,
+      "completions/mean_length": 399.77679443359375,
+      "completions/mean_terminated_length": 399.77679443359375,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 0.8738715501676554,
+      "grad_norm": 0.6375681161880493,
+      "kl": 0.091552734375,
+      "learning_rate": 1e-06,
+      "loss": 0.001,
+      "num_tokens": 97634091.0,
+      "reward": 1.3562501668930054,
+      "reward_std": 0.1318189799785614,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3354521691799164,
+      "step": 847
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 751.0,
+      "completions/max_terminated_length": 751.0,
+      "completions/mean_length": 381.2410888671875,
+      "completions/mean_terminated_length": 381.2410888671875,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "epoch": 0.8749032757286562,
+      "grad_norm": 0.7441376447677612,
+      "kl": 0.10009765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0073,
+      "num_tokens": 97744452.0,
+      "reward": 1.3468750715255737,
+      "reward_std": 0.1355905532836914,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3305251896381378,
+      "step": 848
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2580.0,
+      "completions/max_terminated_length": 2580.0,
+      "completions/mean_length": 430.45538330078125,
+      "completions/mean_terminated_length": 430.45538330078125,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 0.875935001289657,
+      "grad_norm": 0.693661630153656,
+      "kl": 0.0860595703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0138,
+      "num_tokens": 97856888.0,
+      "reward": 1.3218752145767212,
+      "reward_std": 0.14773008227348328,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3218749463558197,
+      "rewards/curriculum_aware_reward_fn/std": 0.29013675451278687,
+      "step": 849
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 997.0,
+      "completions/max_terminated_length": 997.0,
+      "completions/mean_length": 405.1250305175781,
+      "completions/mean_terminated_length": 405.1250305175781,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 0.8769667268506577,
+      "grad_norm": 0.7815752029418945,
+      "kl": 0.080078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0006,
+      "num_tokens": 97973270.0,
+      "reward": 1.3500001430511475,
+      "reward_std": 0.15360049903392792,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34999996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.31515759229660034,
+      "step": 850
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 877.0,
+      "completions/max_terminated_length": 877.0,
+      "completions/mean_length": 383.20538330078125,
+      "completions/mean_terminated_length": 383.20538330078125,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 0.8779984524116585,
+      "grad_norm": 0.7044087648391724,
+      "kl": 0.0887451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0195,
+      "num_tokens": 98082479.0,
+      "reward": 1.28125,
+      "reward_std": 0.1456676721572876,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28125,
+      "rewards/curriculum_aware_reward_fn/std": 0.2965359091758728,
+      "step": 851
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 931.0,
+      "completions/max_terminated_length": 931.0,
+      "completions/mean_length": 441.0000305175781,
+      "completions/mean_terminated_length": 441.0000305175781,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 0.8790301779726593,
+      "grad_norm": 0.8918853402137756,
+      "kl": 0.0916748046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0014,
+      "num_tokens": 98202249.0,
+      "reward": 1.2906252145767212,
+      "reward_std": 0.1657445877790451,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.29297566413879395,
+      "step": 852
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 811.0,
+      "completions/max_terminated_length": 811.0,
+      "completions/mean_length": 381.0000305175781,
+      "completions/mean_terminated_length": 381.0000305175781,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 0.8800619035336601,
+      "grad_norm": 0.8854279518127441,
+      "kl": 0.0831298828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0002,
+      "num_tokens": 98316314.0,
+      "reward": 1.2660716772079468,
+      "reward_std": 0.22139222919940948,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.30602067708969116,
+      "step": 853
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 767.0,
+      "completions/max_terminated_length": 767.0,
+      "completions/mean_length": 411.7410888671875,
+      "completions/mean_terminated_length": 411.7410888671875,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 0.8810936290946608,
+      "grad_norm": 0.8486577868461609,
+      "kl": 0.0880126953125,
+      "learning_rate": 1e-06,
+      "loss": -0.013,
+      "num_tokens": 98433899.0,
+      "reward": 1.3531252145767212,
+      "reward_std": 0.175072580575943,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.29896828532218933,
+      "step": 854
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 829.0,
+      "completions/max_terminated_length": 829.0,
+      "completions/mean_length": 402.1339416503906,
+      "completions/mean_terminated_length": 402.1339416503906,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 0.8821253546556616,
+      "grad_norm": 0.8045067191123962,
+      "kl": 0.0709228515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0155,
+      "num_tokens": 98546075.0,
+      "reward": 1.3437501192092896,
+      "reward_std": 0.16631193459033966,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34375,
+      "rewards/curriculum_aware_reward_fn/std": 0.3080105185508728,
+      "step": 855
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 685.0,
+      "completions/max_terminated_length": 685.0,
+      "completions/mean_length": 379.7410888671875,
+      "completions/mean_terminated_length": 379.7410888671875,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 0.8831570802166624,
+      "grad_norm": 0.8452091813087463,
+      "kl": 0.0853271484375,
+      "learning_rate": 1e-06,
+      "loss": 0.016,
+      "num_tokens": 98658711.0,
+      "reward": 1.3250001668930054,
+      "reward_std": 0.17389629781246185,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32500001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.30704930424690247,
+      "step": 856
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 889.0,
+      "completions/max_terminated_length": 889.0,
+      "completions/mean_length": 417.1696472167969,
+      "completions/mean_terminated_length": 417.1696472167969,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 0.8841888057776631,
+      "grad_norm": 0.83913254737854,
+      "kl": 0.0849609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0166,
+      "num_tokens": 98775763.0,
+      "reward": 1.234375238418579,
+      "reward_std": 0.18759752810001373,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.234375,
+      "rewards/curriculum_aware_reward_fn/std": 0.267372190952301,
+      "step": 857
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1142.0,
+      "completions/max_terminated_length": 1142.0,
+      "completions/mean_length": 358.6339416503906,
+      "completions/mean_terminated_length": 358.6339416503906,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 0.885220531338664,
+      "grad_norm": 0.6839672923088074,
+      "kl": 0.0914306640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0052,
+      "num_tokens": 98871601.0,
+      "reward": 1.4375001192092896,
+      "reward_std": 0.09861234575510025,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4375,
+      "rewards/curriculum_aware_reward_fn/std": 0.31340184807777405,
+      "step": 858
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 677.0,
+      "completions/max_terminated_length": 677.0,
+      "completions/mean_length": 387.8750305175781,
+      "completions/mean_terminated_length": 387.8750305175781,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 0.8862522568996647,
+      "grad_norm": 0.7101348638534546,
+      "kl": 0.0872802734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0031,
+      "num_tokens": 98979394.0,
+      "reward": 1.1968750953674316,
+      "reward_std": 0.13633039593696594,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.25204402208328247,
+      "step": 859
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 784.0,
+      "completions/max_terminated_length": 784.0,
+      "completions/mean_length": 367.7857360839844,
+      "completions/mean_terminated_length": 367.7857360839844,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 0.8872839824606654,
+      "grad_norm": 0.7461708188056946,
+      "kl": 0.0897216796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0021,
+      "num_tokens": 99089417.0,
+      "reward": 1.3187501430511475,
+      "reward_std": 0.12355701625347137,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3187499940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.32055166363716125,
+      "step": 860
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 833.0,
+      "completions/max_terminated_length": 833.0,
+      "completions/mean_length": 379.52679443359375,
+      "completions/mean_terminated_length": 379.52679443359375,
+      "completions/min_length": 129.0,
+      "completions/min_terminated_length": 129.0,
+      "epoch": 0.8883157080216663,
+      "grad_norm": 0.7257040739059448,
+      "kl": 0.090087890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0127,
+      "num_tokens": 99192817.0,
+      "reward": 1.3656251430511475,
+      "reward_std": 0.14645807445049286,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36562496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.31998246908187866,
+      "step": 861
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 662.0,
+      "completions/max_terminated_length": 662.0,
+      "completions/mean_length": 333.8571472167969,
+      "completions/mean_terminated_length": 333.8571472167969,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 0.889347433582667,
+      "grad_norm": 0.8632603883743286,
+      "kl": 0.104736328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0267,
+      "num_tokens": 99286755.0,
+      "reward": 1.3781250715255737,
+      "reward_std": 0.1209789365530014,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3156418800354004,
+      "step": 862
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 759.0,
+      "completions/max_terminated_length": 759.0,
+      "completions/mean_length": 371.9375305175781,
+      "completions/mean_terminated_length": 371.9375305175781,
+      "completions/min_length": 131.0,
+      "completions/min_terminated_length": 131.0,
+      "epoch": 0.8903791591436678,
+      "grad_norm": 0.804155170917511,
+      "kl": 0.1024169921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0062,
+      "num_tokens": 99410704.0,
+      "reward": 1.28125,
+      "reward_std": 0.1209929883480072,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28125,
+      "rewards/curriculum_aware_reward_fn/std": 0.34472373127937317,
+      "step": 863
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 731.0,
+      "completions/max_terminated_length": 731.0,
+      "completions/mean_length": 418.89288330078125,
+      "completions/mean_terminated_length": 418.89288330078125,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 0.8914108847046686,
+      "grad_norm": 0.6388071775436401,
+      "kl": 0.07958984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0076,
+      "num_tokens": 99527053.0,
+      "reward": 1.2250001430511475,
+      "reward_std": 0.08316321671009064,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22500000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.29285791516304016,
+      "step": 864
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 731.0,
+      "completions/max_terminated_length": 731.0,
+      "completions/mean_length": 352.4732360839844,
+      "completions/mean_terminated_length": 352.4732360839844,
+      "completions/min_length": 148.0,
+      "completions/min_terminated_length": 148.0,
+      "epoch": 0.8924426102656693,
+      "grad_norm": 0.7764028310775757,
+      "kl": 0.096923828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0175,
+      "num_tokens": 99630426.0,
+      "reward": 1.3781250715255737,
+      "reward_std": 0.12856173515319824,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.2704501748085022,
+      "step": 865
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 959.0,
+      "completions/max_terminated_length": 959.0,
+      "completions/mean_length": 360.95538330078125,
+      "completions/mean_terminated_length": 360.95538330078125,
+      "completions/min_length": 150.0,
+      "completions/min_terminated_length": 150.0,
+      "epoch": 0.8934743358266701,
+      "grad_norm": 0.8951486945152283,
+      "kl": 0.1124267578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0125,
+      "num_tokens": 99738494.0,
+      "reward": 1.2281250953674316,
+      "reward_std": 0.16628824174404144,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22812499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.2885020077228546,
+      "step": 866
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 646.0,
+      "completions/max_terminated_length": 646.0,
+      "completions/mean_length": 368.65179443359375,
+      "completions/mean_terminated_length": 368.65179443359375,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 0.8945060613876709,
+      "grad_norm": 0.8461310267448425,
+      "kl": 0.0941162109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0031,
+      "num_tokens": 99844967.0,
+      "reward": 1.3312500715255737,
+      "reward_std": 0.13890205323696136,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33124998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.28899678587913513,
+      "step": 867
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 576.0,
+      "completions/max_terminated_length": 576.0,
+      "completions/mean_length": 334.75,
+      "completions/mean_terminated_length": 334.75,
+      "completions/min_length": 154.0,
+      "completions/min_terminated_length": 154.0,
+      "epoch": 0.8955377869486717,
+      "grad_norm": 0.9520079493522644,
+      "kl": 0.1060791015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0101,
+      "num_tokens": 99947860.0,
+      "reward": 1.3500001430511475,
+      "reward_std": 0.1789051741361618,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3499999940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.31163617968559265,
+      "step": 868
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 903.0,
+      "completions/max_terminated_length": 903.0,
+      "completions/mean_length": 367.9196472167969,
+      "completions/mean_terminated_length": 367.9196472167969,
+      "completions/min_length": 161.0,
+      "completions/min_terminated_length": 161.0,
+      "epoch": 0.8965695125096724,
+      "grad_norm": 0.9064900279045105,
+      "kl": 0.10302734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0049,
+      "num_tokens": 100056708.0,
+      "reward": 1.3218750953674316,
+      "reward_std": 0.17131094634532928,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3218749463558197,
+      "rewards/curriculum_aware_reward_fn/std": 0.3878086507320404,
+      "step": 869
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 620.0,
+      "completions/max_terminated_length": 620.0,
+      "completions/mean_length": 334.40179443359375,
+      "completions/mean_terminated_length": 334.40179443359375,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 0.8976012380706732,
+      "grad_norm": 1.0107978582382202,
+      "kl": 0.1094970703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0049,
+      "num_tokens": 100164441.0,
+      "reward": 1.3843752145767212,
+      "reward_std": 0.218043714761734,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3843750059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.31850093603134155,
+      "step": 870
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 879.0,
+      "completions/max_terminated_length": 879.0,
+      "completions/mean_length": 358.64288330078125,
+      "completions/mean_terminated_length": 358.64288330078125,
+      "completions/min_length": 107.0,
+      "completions/min_terminated_length": 107.0,
+      "epoch": 0.898632963631674,
+      "grad_norm": 0.8547470569610596,
+      "kl": 0.1004638671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0047,
+      "num_tokens": 100266412.0,
+      "reward": 1.3375000953674316,
+      "reward_std": 0.17571650445461273,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.31839263439178467,
+      "step": 871
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 695.0,
+      "completions/max_terminated_length": 695.0,
+      "completions/mean_length": 330.8660888671875,
+      "completions/mean_terminated_length": 330.8660888671875,
+      "completions/min_length": 128.0,
+      "completions/min_terminated_length": 128.0,
+      "epoch": 0.8996646891926747,
+      "grad_norm": 0.8031458258628845,
+      "kl": 0.1195068359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0022,
+      "num_tokens": 100361142.0,
+      "reward": 1.4312502145767212,
+      "reward_std": 0.12636539340019226,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4312500059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.2933286130428314,
+      "step": 872
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 980.0,
+      "completions/max_terminated_length": 980.0,
+      "completions/mean_length": 397.1964416503906,
+      "completions/mean_terminated_length": 397.1964416503906,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 0.9006964147536756,
+      "grad_norm": 0.881301999092102,
+      "kl": 0.0902099609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0043,
+      "num_tokens": 100477924.0,
+      "reward": 1.21875,
+      "reward_std": 0.171630397439003,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21875,
+      "rewards/curriculum_aware_reward_fn/std": 0.2938655614852905,
+      "step": 873
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 871.0,
+      "completions/max_terminated_length": 871.0,
+      "completions/mean_length": 369.45538330078125,
+      "completions/mean_terminated_length": 369.45538330078125,
+      "completions/min_length": 122.0,
+      "completions/min_terminated_length": 122.0,
+      "epoch": 0.9017281403146763,
+      "grad_norm": 0.660327672958374,
+      "kl": 0.0931396484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0076,
+      "num_tokens": 100580651.0,
+      "reward": 1.3968751430511475,
+      "reward_std": 0.09679971635341644,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39687496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.30263715982437134,
+      "step": 874
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 937.0,
+      "completions/max_terminated_length": 937.0,
+      "completions/mean_length": 445.89288330078125,
+      "completions/mean_terminated_length": 445.89288330078125,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 0.902759865875677,
+      "grad_norm": 0.8164187669754028,
+      "kl": 0.09228515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0287,
+      "num_tokens": 100704889.0,
+      "reward": 1.2750000953674316,
+      "reward_std": 0.15713198482990265,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.32012102007865906,
+      "step": 875
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 784.0,
+      "completions/max_terminated_length": 784.0,
+      "completions/mean_length": 395.4821472167969,
+      "completions/mean_terminated_length": 395.4821472167969,
+      "completions/min_length": 135.0,
+      "completions/min_terminated_length": 135.0,
+      "epoch": 0.9037915914366779,
+      "grad_norm": 0.891213059425354,
+      "kl": 0.0943603515625,
+      "learning_rate": 1e-06,
+      "loss": -0.012,
+      "num_tokens": 100811588.0,
+      "reward": 1.3531250953674316,
+      "reward_std": 0.2118910700082779,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3743632137775421,
+      "step": 876
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 858.0,
+      "completions/max_terminated_length": 858.0,
+      "completions/mean_length": 364.4464416503906,
+      "completions/mean_terminated_length": 364.4464416503906,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "epoch": 0.9048233169976786,
+      "grad_norm": 0.8460397720336914,
+      "kl": 0.10400390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0037,
+      "num_tokens": 100917464.0,
+      "reward": 1.2062500715255737,
+      "reward_std": 0.12978744506835938,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.20624999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.27616459131240845,
+      "step": 877
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 851.0,
+      "completions/max_terminated_length": 851.0,
+      "completions/mean_length": 361.8035888671875,
+      "completions/mean_terminated_length": 361.8035888671875,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "epoch": 0.9058550425586794,
+      "grad_norm": 0.8688164353370667,
+      "kl": 0.1024169921875,
+      "learning_rate": 1e-06,
+      "loss": -0.002,
+      "num_tokens": 101019390.0,
+      "reward": 1.375000238418579,
+      "reward_std": 0.1342623233795166,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.375,
+      "rewards/curriculum_aware_reward_fn/std": 0.3176490068435669,
+      "step": 878
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1071.0,
+      "completions/max_terminated_length": 1071.0,
+      "completions/mean_length": 432.39288330078125,
+      "completions/mean_terminated_length": 432.39288330078125,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "epoch": 0.9068867681196802,
+      "grad_norm": 0.7971656322479248,
+      "kl": 0.083740234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0371,
+      "num_tokens": 101136012.0,
+      "reward": 1.3156250715255737,
+      "reward_std": 0.14622807502746582,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.30067726969718933,
+      "step": 879
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 809.0,
+      "completions/max_terminated_length": 809.0,
+      "completions/mean_length": 415.0982360839844,
+      "completions/mean_terminated_length": 415.0982360839844,
+      "completions/min_length": 148.0,
+      "completions/min_terminated_length": 148.0,
+      "epoch": 0.9079184936806809,
+      "grad_norm": 0.6433001160621643,
+      "kl": 0.08544921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0238,
+      "num_tokens": 101242500.0,
+      "reward": 1.3406251668930054,
+      "reward_std": 0.09915289282798767,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34062501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.33040592074394226,
+      "step": 880
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 856.0,
+      "completions/max_terminated_length": 856.0,
+      "completions/mean_length": 427.8750305175781,
+      "completions/mean_terminated_length": 427.8750305175781,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "epoch": 0.9089502192416817,
+      "grad_norm": 0.7412571907043457,
+      "kl": 0.0904541015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0097,
+      "num_tokens": 101354430.0,
+      "reward": 1.28125,
+      "reward_std": 0.13072429597377777,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28125,
+      "rewards/curriculum_aware_reward_fn/std": 0.2965359091758728,
+      "step": 881
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 860.0,
+      "completions/max_terminated_length": 860.0,
+      "completions/mean_length": 390.58929443359375,
+      "completions/mean_terminated_length": 390.58929443359375,
+      "completions/min_length": 149.0,
+      "completions/min_terminated_length": 149.0,
+      "epoch": 0.9099819448026825,
+      "grad_norm": 0.9273052215576172,
+      "kl": 0.1094970703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0034,
+      "num_tokens": 101464879.0,
+      "reward": 1.2660716772079468,
+      "reward_std": 0.15056000649929047,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.31315022706985474,
+      "step": 882
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 610.0,
+      "completions/max_terminated_length": 610.0,
+      "completions/mean_length": 402.2946472167969,
+      "completions/mean_terminated_length": 402.2946472167969,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 0.9110136703636833,
+      "grad_norm": 0.8218299746513367,
+      "kl": 0.090087890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0154,
+      "num_tokens": 101572901.0,
+      "reward": 1.2718751430511475,
+      "reward_std": 0.14793546497821808,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2718749940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.284650981426239,
+      "step": 883
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 737.0,
+      "completions/max_terminated_length": 737.0,
+      "completions/mean_length": 398.83929443359375,
+      "completions/mean_terminated_length": 398.83929443359375,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 0.912045395924684,
+      "grad_norm": 0.9065353274345398,
+      "kl": 0.1005859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0066,
+      "num_tokens": 101675615.0,
+      "reward": 1.3062500953674316,
+      "reward_std": 0.19413165748119354,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3062499463558197,
+      "rewards/curriculum_aware_reward_fn/std": 0.2900858223438263,
+      "step": 884
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 831.0,
+      "completions/max_terminated_length": 831.0,
+      "completions/mean_length": 421.6339416503906,
+      "completions/mean_terminated_length": 421.6339416503906,
+      "completions/min_length": 86.0,
+      "completions/min_terminated_length": 86.0,
+      "epoch": 0.9130771214856848,
+      "grad_norm": 0.6250420212745667,
+      "kl": 0.0860595703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0224,
+      "num_tokens": 101783689.0,
+      "reward": 1.3066965341567993,
+      "reward_std": 0.10317334532737732,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3253571093082428,
+      "step": 885
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 824.0,
+      "completions/max_terminated_length": 824.0,
+      "completions/mean_length": 431.5982360839844,
+      "completions/mean_terminated_length": 431.5982360839844,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "epoch": 0.9141088470466856,
+      "grad_norm": 0.8888685703277588,
+      "kl": 0.1077880859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0014,
+      "num_tokens": 101901874.0,
+      "reward": 1.3531252145767212,
+      "reward_std": 0.15322957932949066,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.29525384306907654,
+      "step": 886
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 822.0,
+      "completions/max_terminated_length": 822.0,
+      "completions/mean_length": 420.5714416503906,
+      "completions/mean_terminated_length": 420.5714416503906,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "epoch": 0.9151405726076863,
+      "grad_norm": 0.7968472242355347,
+      "kl": 0.09375,
+      "learning_rate": 1e-06,
+      "loss": -0.0153,
+      "num_tokens": 102019160.0,
+      "reward": 1.3218750953674316,
+      "reward_std": 0.13992083072662354,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3218750059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.30133193731307983,
+      "step": 887
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 666.0,
+      "completions/max_terminated_length": 666.0,
+      "completions/mean_length": 429.77679443359375,
+      "completions/mean_terminated_length": 429.77679443359375,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 0.9161722981686872,
+      "grad_norm": 0.6858448386192322,
+      "kl": 0.1043701171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0029,
+      "num_tokens": 102129896.0,
+      "reward": 1.303125023841858,
+      "reward_std": 0.12142420560121536,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30312496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.3305251896381378,
+      "step": 888
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1049.0,
+      "completions/max_terminated_length": 1049.0,
+      "completions/mean_length": 464.20538330078125,
+      "completions/mean_terminated_length": 464.20538330078125,
+      "completions/min_length": 120.0,
+      "completions/min_terminated_length": 120.0,
+      "epoch": 0.9172040237296879,
+      "grad_norm": 0.6691279411315918,
+      "kl": 0.083984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0139,
+      "num_tokens": 102244341.0,
+      "reward": 1.296875,
+      "reward_std": 0.13214033842086792,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.296875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3123783469200134,
+      "step": 889
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1458.0,
+      "completions/max_terminated_length": 1458.0,
+      "completions/mean_length": 528.2232666015625,
+      "completions/mean_terminated_length": 528.2232666015625,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 0.9182357492906886,
+      "grad_norm": 0.8207437992095947,
+      "kl": 0.074462890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0118,
+      "num_tokens": 102366047.0,
+      "reward": 1.1968750953674316,
+      "reward_std": 0.16041667759418488,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.2849277853965759,
+      "step": 890
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 892.0,
+      "completions/max_terminated_length": 892.0,
+      "completions/mean_length": 457.3482360839844,
+      "completions/mean_terminated_length": 457.3482360839844,
+      "completions/min_length": 149.0,
+      "completions/min_terminated_length": 149.0,
+      "epoch": 0.9192674748516895,
+      "grad_norm": 0.7058476805686951,
+      "kl": 0.1005859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0049,
+      "num_tokens": 102486990.0,
+      "reward": 1.3250001668930054,
+      "reward_std": 0.14487729966640472,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32499998807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.40609100461006165,
+      "step": 891
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1023.0,
+      "completions/max_terminated_length": 1023.0,
+      "completions/mean_length": 471.90179443359375,
+      "completions/mean_terminated_length": 471.90179443359375,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 0.9202992004126902,
+      "grad_norm": 0.6852114796638489,
+      "kl": 0.0927734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0006,
+      "num_tokens": 102610319.0,
+      "reward": 1.2535713911056519,
+      "reward_std": 0.11259433627128601,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.26249998807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.28383633494377136,
+      "step": 892
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1055.0,
+      "completions/max_terminated_length": 1055.0,
+      "completions/mean_length": 482.857177734375,
+      "completions/mean_terminated_length": 482.857177734375,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 0.921330925973691,
+      "grad_norm": 0.7629786133766174,
+      "kl": 0.086181640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0122,
+      "num_tokens": 102730402.0,
+      "reward": 1.3129466772079468,
+      "reward_std": 0.2128395140171051,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32187503576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.31911906599998474,
+      "step": 893
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 929.0,
+      "completions/max_terminated_length": 929.0,
+      "completions/mean_length": 489.0982360839844,
+      "completions/mean_terminated_length": 489.0982360839844,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 0.9223626515346918,
+      "grad_norm": 0.6978335380554199,
+      "kl": 0.08154296875,
+      "learning_rate": 1e-06,
+      "loss": 0.011,
+      "num_tokens": 102848804.0,
+      "reward": 1.2946430444717407,
+      "reward_std": 0.21274369955062866,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3125,
+      "rewards/curriculum_aware_reward_fn/std": 0.31987470388412476,
+      "step": 894
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1120.0,
+      "completions/max_terminated_length": 1120.0,
+      "completions/mean_length": 521.3125,
+      "completions/mean_terminated_length": 521.3125,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 0.9233943770956925,
+      "grad_norm": 0.7006912231445312,
+      "kl": 0.08349609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0092,
+      "num_tokens": 102982035.0,
+      "reward": 1.240625023841858,
+      "reward_std": 0.13423092663288116,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24062497913837433,
+      "rewards/curriculum_aware_reward_fn/std": 0.2780669629573822,
+      "step": 895
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1302.0,
+      "completions/max_terminated_length": 1302.0,
+      "completions/mean_length": 473.90179443359375,
+      "completions/mean_terminated_length": 473.90179443359375,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 0.9244261026566933,
+      "grad_norm": 0.8263719081878662,
+      "kl": 0.0882568359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0013,
+      "num_tokens": 103108528.0,
+      "reward": 1.3531252145767212,
+      "reward_std": 0.17819008231163025,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.2914920449256897,
+      "step": 896
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 721.0,
+      "completions/max_terminated_length": 721.0,
+      "completions/mean_length": 411.8125305175781,
+      "completions/mean_terminated_length": 411.8125305175781,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 0.9254578282176941,
+      "grad_norm": 0.5894196629524231,
+      "kl": 0.10205078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0119,
+      "num_tokens": 103214179.0,
+      "reward": 1.4312502145767212,
+      "reward_std": 0.10276122391223907,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4312500059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.31509506702423096,
+      "step": 897
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 765.0,
+      "completions/max_terminated_length": 765.0,
+      "completions/mean_length": 440.5535888671875,
+      "completions/mean_terminated_length": 440.5535888671875,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 0.9264895537786949,
+      "grad_norm": 0.6899087429046631,
+      "kl": 0.0850830078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0034,
+      "num_tokens": 103324337.0,
+      "reward": 1.371875286102295,
+      "reward_std": 0.1472577452659607,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37187501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.29817622900009155,
+      "step": 898
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1060.0,
+      "completions/max_terminated_length": 1060.0,
+      "completions/mean_length": 473.1607360839844,
+      "completions/mean_terminated_length": 473.1607360839844,
+      "completions/min_length": 254.0,
+      "completions/min_terminated_length": 254.0,
+      "epoch": 0.9275212793396956,
+      "grad_norm": 0.6388729214668274,
+      "kl": 0.0850830078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0178,
+      "num_tokens": 103442473.0,
+      "reward": 1.3156250715255737,
+      "reward_std": 0.13890205323696136,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.31501689553260803,
+      "step": 899
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1217.0,
+      "completions/max_terminated_length": 1217.0,
+      "completions/mean_length": 504.4910888671875,
+      "completions/mean_terminated_length": 504.4910888671875,
+      "completions/min_length": 270.0,
+      "completions/min_terminated_length": 270.0,
+      "epoch": 0.9285530049006964,
+      "grad_norm": 0.651702880859375,
+      "kl": 0.092041015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0121,
+      "num_tokens": 103582108.0,
+      "reward": 1.1218750476837158,
+      "reward_std": 0.14888589084148407,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.22388675808906555,
+      "step": 900
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1006.0,
+      "completions/max_terminated_length": 1006.0,
+      "completions/mean_length": 508.1875305175781,
+      "completions/mean_terminated_length": 508.1875305175781,
+      "completions/min_length": 308.0,
+      "completions/min_terminated_length": 308.0,
+      "epoch": 0.9295847304616972,
+      "grad_norm": 0.7340144515037537,
+      "kl": 0.093994140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0038,
+      "num_tokens": 103709893.0,
+      "reward": 1.2625001668930054,
+      "reward_std": 0.1418541967868805,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.26249998807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.30627813935279846,
+      "step": 901
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 751.0,
+      "completions/max_terminated_length": 751.0,
+      "completions/mean_length": 478.5625305175781,
+      "completions/mean_terminated_length": 478.5625305175781,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 0.9306164560226979,
+      "grad_norm": 0.771500825881958,
+      "kl": 0.091796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0064,
+      "num_tokens": 103828620.0,
+      "reward": 1.2879464626312256,
+      "reward_std": 0.17468391358852386,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.296875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3123783469200134,
+      "step": 902
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 819.0,
+      "completions/max_terminated_length": 819.0,
+      "completions/mean_length": 473.7857360839844,
+      "completions/mean_terminated_length": 473.7857360839844,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 0.9316481815836988,
+      "grad_norm": 0.7887445092201233,
+      "kl": 0.086181640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0149,
+      "num_tokens": 103944649.0,
+      "reward": 1.312500238418579,
+      "reward_std": 0.17563453316688538,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3125,
+      "rewards/curriculum_aware_reward_fn/std": 0.2947360873222351,
+      "step": 903
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1001.0,
+      "completions/max_terminated_length": 1001.0,
+      "completions/mean_length": 458.6964416503906,
+      "completions/mean_terminated_length": 458.6964416503906,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 0.9326799071446995,
+      "grad_norm": 0.7262348532676697,
+      "kl": 0.0970458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0161,
+      "num_tokens": 104060647.0,
+      "reward": 1.3093751668930054,
+      "reward_std": 0.13768966495990753,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3035474121570587,
+      "step": 904
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 805.0,
+      "completions/max_terminated_length": 805.0,
+      "completions/mean_length": 448.0982360839844,
+      "completions/mean_terminated_length": 448.0982360839844,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 0.9337116327057002,
+      "grad_norm": 0.7300897240638733,
+      "kl": 0.10791015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0003,
+      "num_tokens": 104170740.0,
+      "reward": 1.2750002145767212,
+      "reward_std": 0.13089033961296082,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3060206472873688,
+      "step": 905
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 791.0,
+      "completions/max_terminated_length": 791.0,
+      "completions/mean_length": 499.607177734375,
+      "completions/mean_terminated_length": 499.607177734375,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 0.9347433582667011,
+      "grad_norm": 0.7067446708679199,
+      "kl": 0.0953369140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0111,
+      "num_tokens": 104292174.0,
+      "reward": 1.296875,
+      "reward_std": 0.125174880027771,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.296875,
+      "rewards/curriculum_aware_reward_fn/std": 0.27877476811408997,
+      "step": 906
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 918.0,
+      "completions/max_terminated_length": 918.0,
+      "completions/mean_length": 499.76788330078125,
+      "completions/mean_terminated_length": 499.76788330078125,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 0.9357750838277018,
+      "grad_norm": 0.7178418636322021,
+      "kl": 0.1014404296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0112,
+      "num_tokens": 104416811.0,
+      "reward": 1.2875001430511475,
+      "reward_std": 0.13409018516540527,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2874999940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.3262191116809845,
+      "step": 907
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1000.0,
+      "completions/max_terminated_length": 1000.0,
+      "completions/mean_length": 481.6339416503906,
+      "completions/mean_terminated_length": 481.6339416503906,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 0.9368068093887026,
+      "grad_norm": 0.7717330455780029,
+      "kl": 0.0947265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0127,
+      "num_tokens": 104539340.0,
+      "reward": 1.2906252145767212,
+      "reward_std": 0.14821362495422363,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3004149794578552,
+      "step": 908
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 828.0,
+      "completions/max_terminated_length": 828.0,
+      "completions/mean_length": 508.4375305175781,
+      "completions/mean_terminated_length": 508.4375305175781,
+      "completions/min_length": 285.0,
+      "completions/min_terminated_length": 285.0,
+      "epoch": 0.9378385349497034,
+      "grad_norm": 0.6931024789810181,
+      "kl": 0.0836181640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0075,
+      "num_tokens": 104664367.0,
+      "reward": 1.3062502145767212,
+      "reward_std": 0.1397494077682495,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.2900857925415039,
+      "step": 909
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1006.0,
+      "completions/max_terminated_length": 1006.0,
+      "completions/mean_length": 534.7053833007812,
+      "completions/mean_terminated_length": 534.7053833007812,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 0.9388702605107041,
+      "grad_norm": 0.640984296798706,
+      "kl": 0.0953369140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0038,
+      "num_tokens": 104789816.0,
+      "reward": 1.3406251668930054,
+      "reward_std": 0.13892993330955505,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34062501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3887222707271576,
+      "step": 910
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 939.0,
+      "completions/max_terminated_length": 939.0,
+      "completions/mean_length": 466.21429443359375,
+      "completions/mean_terminated_length": 466.21429443359375,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "epoch": 0.9399019860717049,
+      "grad_norm": 0.6224113702774048,
+      "kl": 0.0948486328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0037,
+      "num_tokens": 104908161.0,
+      "reward": 1.2973215579986572,
+      "reward_std": 0.1287800669670105,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30625003576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.3359218239784241,
+      "step": 911
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 859.0,
+      "completions/max_terminated_length": 859.0,
+      "completions/mean_length": 493.58038330078125,
+      "completions/mean_terminated_length": 493.58038330078125,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 0.9409337116327057,
+      "grad_norm": 0.6648264527320862,
+      "kl": 0.0855712890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0144,
+      "num_tokens": 105028752.0,
+      "reward": 1.2468750476837158,
+      "reward_std": 0.13067768514156342,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24687500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.30315765738487244,
+      "step": 912
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 832.0,
+      "completions/max_terminated_length": 832.0,
+      "completions/mean_length": 488.169677734375,
+      "completions/mean_terminated_length": 488.169677734375,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 0.9419654371937065,
+      "grad_norm": 0.7831220626831055,
+      "kl": 0.0931396484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0193,
+      "num_tokens": 105149780.0,
+      "reward": 1.275892972946167,
+      "reward_std": 0.2222750186920166,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3064711391925812,
+      "step": 913
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1377.0,
+      "completions/max_terminated_length": 1377.0,
+      "completions/mean_length": 543.3035888671875,
+      "completions/mean_terminated_length": 543.3035888671875,
+      "completions/min_length": 301.0,
+      "completions/min_terminated_length": 301.0,
+      "epoch": 0.9429971627547072,
+      "grad_norm": 0.7976868748664856,
+      "kl": 0.08984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0172,
+      "num_tokens": 105278362.0,
+      "reward": 1.2531250715255737,
+      "reward_std": 0.18378396332263947,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25312498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.2747874855995178,
+      "step": 914
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1282.0,
+      "completions/max_terminated_length": 1282.0,
+      "completions/mean_length": 565.607177734375,
+      "completions/mean_terminated_length": 565.607177734375,
+      "completions/min_length": 316.0,
+      "completions/min_terminated_length": 316.0,
+      "epoch": 0.944028888315708,
+      "grad_norm": 0.6149550080299377,
+      "kl": 0.0889892578125,
+      "learning_rate": 1e-06,
+      "loss": 0.006,
+      "num_tokens": 105419774.0,
+      "reward": 1.2218750715255737,
+      "reward_std": 0.10345561802387238,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22187498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.29711687564849854,
+      "step": 915
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 972.0,
+      "completions/max_terminated_length": 972.0,
+      "completions/mean_length": 491.8750305175781,
+      "completions/mean_terminated_length": 491.8750305175781,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 0.9450606138767088,
+      "grad_norm": 0.782224714756012,
+      "kl": 0.0875244140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0176,
+      "num_tokens": 105543814.0,
+      "reward": 1.4281251430511475,
+      "reward_std": 0.1730240136384964,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4281249940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.2923022508621216,
+      "step": 916
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2439.0,
+      "completions/max_terminated_length": 2439.0,
+      "completions/mean_length": 503.6964416503906,
+      "completions/mean_terminated_length": 503.6964416503906,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 0.9460923394377095,
+      "grad_norm": 0.786839485168457,
+      "kl": 0.0928955078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0078,
+      "num_tokens": 105672468.0,
+      "reward": 1.3031251430511475,
+      "reward_std": 0.1552998125553131,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3031249940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.30263715982437134,
+      "step": 917
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 884.0,
+      "completions/max_terminated_length": 884.0,
+      "completions/mean_length": 515.0982666015625,
+      "completions/mean_terminated_length": 515.0982666015625,
+      "completions/min_length": 302.0,
+      "completions/min_terminated_length": 302.0,
+      "epoch": 0.9471240649987104,
+      "grad_norm": 0.7849997878074646,
+      "kl": 0.0869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0216,
+      "num_tokens": 105798969.0,
+      "reward": 1.2937501668930054,
+      "reward_std": 0.2153172492980957,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3064711391925812,
+      "step": 918
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 899.0,
+      "completions/max_terminated_length": 899.0,
+      "completions/mean_length": 458.0714416503906,
+      "completions/mean_terminated_length": 458.0714416503906,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 0.9481557905597111,
+      "grad_norm": 0.604444682598114,
+      "kl": 0.084228515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0017,
+      "num_tokens": 105912510.0,
+      "reward": 1.3781250715255737,
+      "reward_std": 0.11152077466249466,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.32596227526664734,
+      "step": 919
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1160.0,
+      "completions/max_terminated_length": 1160.0,
+      "completions/mean_length": 500.7232360839844,
+      "completions/mean_terminated_length": 500.7232360839844,
+      "completions/min_length": 259.0,
+      "completions/min_terminated_length": 259.0,
+      "epoch": 0.9491875161207118,
+      "grad_norm": 0.7500540018081665,
+      "kl": 0.097412109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0006,
+      "num_tokens": 106034706.0,
+      "reward": 1.309375286102295,
+      "reward_std": 0.15969355404376984,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30937498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.29243704676628113,
+      "step": 920
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1385.0,
+      "completions/max_terminated_length": 1385.0,
+      "completions/mean_length": 535.7678833007812,
+      "completions/mean_terminated_length": 535.7678833007812,
+      "completions/min_length": 271.0,
+      "completions/min_terminated_length": 271.0,
+      "epoch": 0.9502192416817127,
+      "grad_norm": 0.8850662112236023,
+      "kl": 0.0933837890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0163,
+      "num_tokens": 106158363.0,
+      "reward": 1.3375000953674316,
+      "reward_std": 0.11517629027366638,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33750003576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.32525110244750977,
+      "step": 921
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1005.0,
+      "completions/max_terminated_length": 1005.0,
+      "completions/mean_length": 513.9017944335938,
+      "completions/mean_terminated_length": 513.9017944335938,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 0.9512509672427134,
+      "grad_norm": 0.6601965427398682,
+      "kl": 0.088134765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0051,
+      "num_tokens": 106284660.0,
+      "reward": 1.2781251668930054,
+      "reward_std": 0.16729682683944702,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.27812498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.3156418800354004,
+      "step": 922
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 972.0,
+      "completions/max_terminated_length": 972.0,
+      "completions/mean_length": 474.6250305175781,
+      "completions/mean_terminated_length": 474.6250305175781,
+      "completions/min_length": 254.0,
+      "completions/min_terminated_length": 254.0,
+      "epoch": 0.9522826928037142,
+      "grad_norm": 0.8933461904525757,
+      "kl": 0.101806640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0261,
+      "num_tokens": 106400034.0,
+      "reward": 1.3375002145767212,
+      "reward_std": 0.21545302867889404,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.30781853199005127,
+      "step": 923
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1058.0,
+      "completions/max_terminated_length": 1058.0,
+      "completions/mean_length": 517.5803833007812,
+      "completions/mean_terminated_length": 517.5803833007812,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "epoch": 0.953314418364715,
+      "grad_norm": 0.5526024103164673,
+      "kl": 0.0885009765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0,
+      "num_tokens": 106525028.0,
+      "reward": 1.3531252145767212,
+      "reward_std": 0.09949037432670593,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3305251896381378,
+      "step": 924
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 929.0,
+      "completions/max_terminated_length": 929.0,
+      "completions/mean_length": 523.7767944335938,
+      "completions/mean_terminated_length": 523.7767944335938,
+      "completions/min_length": 320.0,
+      "completions/min_terminated_length": 320.0,
+      "epoch": 0.9543461439257157,
+      "grad_norm": 0.7154967188835144,
+      "kl": 0.0843505859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0009,
+      "num_tokens": 106653523.0,
+      "reward": 1.2660716772079468,
+      "reward_std": 0.21331168711185455,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.32355013489723206,
+      "step": 925
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 901.0,
+      "completions/max_terminated_length": 901.0,
+      "completions/mean_length": 449.64288330078125,
+      "completions/mean_terminated_length": 449.64288330078125,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 0.9553778694867165,
+      "grad_norm": 0.6225499510765076,
+      "kl": 0.0948486328125,
+      "learning_rate": 1e-06,
+      "loss": 0.026,
+      "num_tokens": 106772896.0,
+      "reward": 1.4000002145767212,
+      "reward_std": 0.10782329738140106,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4000000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.31465697288513184,
+      "step": 926
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 904.0,
+      "completions/max_terminated_length": 904.0,
+      "completions/mean_length": 445.544677734375,
+      "completions/mean_terminated_length": 445.544677734375,
+      "completions/min_length": 231.0,
+      "completions/min_terminated_length": 231.0,
+      "epoch": 0.9564095950477173,
+      "grad_norm": 0.8484821915626526,
+      "kl": 0.095458984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0083,
+      "num_tokens": 106885286.0,
+      "reward": 1.3437501192092896,
+      "reward_std": 0.17554441094398499,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34375,
+      "rewards/curriculum_aware_reward_fn/std": 0.32543283700942993,
+      "step": 927
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 770.0,
+      "completions/max_terminated_length": 770.0,
+      "completions/mean_length": 463.8750305175781,
+      "completions/mean_terminated_length": 463.8750305175781,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 0.9574413206087181,
+      "grad_norm": 0.7585766911506653,
+      "kl": 0.0902099609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0175,
+      "num_tokens": 107002613.0,
+      "reward": 1.3562501668930054,
+      "reward_std": 0.17320376634597778,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.30440643429756165,
+      "step": 928
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 897.0,
+      "completions/max_terminated_length": 897.0,
+      "completions/mean_length": 527.857177734375,
+      "completions/mean_terminated_length": 527.857177734375,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 0.9584730461697188,
+      "grad_norm": 0.6367157101631165,
+      "kl": 0.0794677734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0096,
+      "num_tokens": 107130872.0,
+      "reward": 1.2468751668930054,
+      "reward_std": 0.16819968819618225,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24687500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.2920324206352234,
+      "step": 929
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1474.0,
+      "completions/max_terminated_length": 1474.0,
+      "completions/mean_length": 537.3214721679688,
+      "completions/mean_terminated_length": 537.3214721679688,
+      "completions/min_length": 301.0,
+      "completions/min_terminated_length": 301.0,
+      "epoch": 0.9595047717307196,
+      "grad_norm": 0.7050769329071045,
+      "kl": 0.088623046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0027,
+      "num_tokens": 107258133.0,
+      "reward": 1.3343751430511475,
+      "reward_std": 0.19970746338367462,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.32680758833885193,
+      "step": 930
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 860.0,
+      "completions/max_terminated_length": 860.0,
+      "completions/mean_length": 512.4017944335938,
+      "completions/mean_terminated_length": 512.4017944335938,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "epoch": 0.9605364972917204,
+      "grad_norm": 0.63140469789505,
+      "kl": 0.0830078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0082,
+      "num_tokens": 107384787.0,
+      "reward": 1.1968750953674316,
+      "reward_std": 0.09665969759225845,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.2963198721408844,
+      "step": 931
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1245.0,
+      "completions/max_terminated_length": 1245.0,
+      "completions/mean_length": 490.1785888671875,
+      "completions/mean_terminated_length": 490.1785888671875,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "epoch": 0.9615682228527211,
+      "grad_norm": 0.7270257472991943,
+      "kl": 0.08984375,
+      "learning_rate": 1e-06,
+      "loss": 0.005,
+      "num_tokens": 107502001.0,
+      "reward": 1.2723214626312256,
+      "reward_std": 0.1774478703737259,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28125,
+      "rewards/curriculum_aware_reward_fn/std": 0.30023449659347534,
+      "step": 932
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1106.0,
+      "completions/max_terminated_length": 1106.0,
+      "completions/mean_length": 521.5982666015625,
+      "completions/mean_terminated_length": 521.5982666015625,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 0.962599948413722,
+      "grad_norm": 0.7357829213142395,
+      "kl": 0.1044921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0343,
+      "num_tokens": 107634711.0,
+      "reward": 1.1973215341567993,
+      "reward_std": 0.1833607256412506,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.20624998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.2840445339679718,
+      "step": 933
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1322.0,
+      "completions/max_terminated_length": 1322.0,
+      "completions/mean_length": 479.90179443359375,
+      "completions/mean_terminated_length": 479.90179443359375,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "epoch": 0.9636316739747227,
+      "grad_norm": 0.6669960021972656,
+      "kl": 0.09765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0036,
+      "num_tokens": 107757557.0,
+      "reward": 1.3968751430511475,
+      "reward_std": 0.14331825077533722,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39687496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.30263715982437134,
+      "step": 934
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 852.0,
+      "completions/max_terminated_length": 852.0,
+      "completions/mean_length": 458.20538330078125,
+      "completions/mean_terminated_length": 458.20538330078125,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 0.9646633995357234,
+      "grad_norm": 0.5764197111129761,
+      "kl": 0.0992431640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0044,
+      "num_tokens": 107869097.0,
+      "reward": 1.3312500715255737,
+      "reward_std": 0.10730495303869247,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33124998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3145943284034729,
+      "step": 935
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2205.0,
+      "completions/max_terminated_length": 2205.0,
+      "completions/mean_length": 528.294677734375,
+      "completions/mean_terminated_length": 528.294677734375,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 0.9656951250967243,
+      "grad_norm": 0.7136504650115967,
+      "kl": 0.0906982421875,
+      "learning_rate": 1e-06,
+      "loss": 0.005,
+      "num_tokens": 107996614.0,
+      "reward": 1.171875,
+      "reward_std": 0.18532191216945648,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.171875,
+      "rewards/curriculum_aware_reward_fn/std": 0.2657456696033478,
+      "step": 936
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1406.0,
+      "completions/max_terminated_length": 1406.0,
+      "completions/mean_length": 549.482177734375,
+      "completions/mean_terminated_length": 549.482177734375,
+      "completions/min_length": 306.0,
+      "completions/min_terminated_length": 306.0,
+      "epoch": 0.966726850657725,
+      "grad_norm": 0.7116082906723022,
+      "kl": 0.0865478515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0004,
+      "num_tokens": 108123494.0,
+      "reward": 1.28125,
+      "reward_std": 0.16612868010997772,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28125,
+      "rewards/curriculum_aware_reward_fn/std": 0.31106650829315186,
+      "step": 937
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1171.0,
+      "completions/max_terminated_length": 1171.0,
+      "completions/mean_length": 495.7232360839844,
+      "completions/mean_terminated_length": 495.7232360839844,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 0.9677585762187259,
+      "grad_norm": 0.760637640953064,
+      "kl": 0.0909423828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0004,
+      "num_tokens": 108245816.0,
+      "reward": 1.328125,
+      "reward_std": 0.21447554230690002,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.328125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3994241952896118,
+      "step": 938
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1347.0,
+      "completions/max_terminated_length": 1347.0,
+      "completions/mean_length": 536.9375,
+      "completions/mean_terminated_length": 536.9375,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 0.9687903017797266,
+      "grad_norm": 0.6102328300476074,
+      "kl": 0.087158203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0146,
+      "num_tokens": 108379873.0,
+      "reward": 1.3187501430511475,
+      "reward_std": 0.1227864921092987,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3187499940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.3135904371738434,
+      "step": 939
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1049.0,
+      "completions/max_terminated_length": 1049.0,
+      "completions/mean_length": 508.4107360839844,
+      "completions/mean_terminated_length": 508.4107360839844,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 0.9698220273407274,
+      "grad_norm": 0.6803673505783081,
+      "kl": 0.09033203125,
+      "learning_rate": 1e-06,
+      "loss": 0.005,
+      "num_tokens": 108501726.0,
+      "reward": 1.3562501668930054,
+      "reward_std": 0.12399572134017944,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.31157293915748596,
+      "step": 940
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1734.0,
+      "completions/max_terminated_length": 1734.0,
+      "completions/mean_length": 502.71429443359375,
+      "completions/mean_terminated_length": 502.71429443359375,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 0.9708537529017282,
+      "grad_norm": 0.5645270943641663,
+      "kl": 0.099853515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0031,
+      "num_tokens": 108627841.0,
+      "reward": 1.3000000715255737,
+      "reward_std": 0.0953846201300621,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29999998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3181449770927429,
+      "step": 941
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 789.0,
+      "completions/max_terminated_length": 789.0,
+      "completions/mean_length": 484.02679443359375,
+      "completions/mean_terminated_length": 484.02679443359375,
+      "completions/min_length": 257.0,
+      "completions/min_terminated_length": 257.0,
+      "epoch": 0.9718854784627289,
+      "grad_norm": 0.7351107597351074,
+      "kl": 0.100830078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0055,
+      "num_tokens": 108751107.0,
+      "reward": 1.2406251430511475,
+      "reward_std": 0.13194237649440765,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24062500894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.28972890973091125,
+      "step": 942
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 929.0,
+      "completions/max_terminated_length": 929.0,
+      "completions/mean_length": 473.5089416503906,
+      "completions/mean_terminated_length": 473.5089416503906,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 0.9729172040237297,
+      "grad_norm": 0.74190354347229,
+      "kl": 0.099853515625,
+      "learning_rate": 1e-06,
+      "loss": -0.008,
+      "num_tokens": 108866685.0,
+      "reward": 1.2437502145767212,
+      "reward_std": 0.1013740599155426,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24374999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.3074982464313507,
+      "step": 943
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 760.0,
+      "completions/max_terminated_length": 760.0,
+      "completions/mean_length": 501.08038330078125,
+      "completions/mean_terminated_length": 501.08038330078125,
+      "completions/min_length": 295.0,
+      "completions/min_terminated_length": 295.0,
+      "epoch": 0.9739489295847304,
+      "grad_norm": 0.7810709476470947,
+      "kl": 0.1024169921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0204,
+      "num_tokens": 108995851.0,
+      "reward": 1.2062500715255737,
+      "reward_std": 0.15930792689323425,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.20624999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.2840445339679718,
+      "step": 944
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 997.0,
+      "completions/max_terminated_length": 997.0,
+      "completions/mean_length": 513.3125,
+      "completions/mean_terminated_length": 513.3125,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "epoch": 0.9749806551457313,
+      "grad_norm": 0.8026859760284424,
+      "kl": 0.093505859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0276,
+      "num_tokens": 109120353.0,
+      "reward": 1.3375002145767212,
+      "reward_std": 0.1909678429365158,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.31490740180015564,
+      "step": 945
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 690.0,
+      "completions/max_terminated_length": 690.0,
+      "completions/mean_length": 467.669677734375,
+      "completions/mean_terminated_length": 467.669677734375,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 0.976012380706732,
+      "grad_norm": 0.6618792414665222,
+      "kl": 0.1016845703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0008,
+      "num_tokens": 109237776.0,
+      "reward": 1.3125,
+      "reward_std": 0.12639063596725464,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3125,
+      "rewards/curriculum_aware_reward_fn/std": 0.30935126543045044,
+      "step": 946
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 974.0,
+      "completions/max_terminated_length": 974.0,
+      "completions/mean_length": 487.83038330078125,
+      "completions/mean_terminated_length": 487.83038330078125,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 0.9770441062677327,
+      "grad_norm": 0.810634195804596,
+      "kl": 0.09619140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0166,
+      "num_tokens": 109348672.0,
+      "reward": 1.340625286102295,
+      "reward_std": 0.16616889834403992,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34062501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.316763699054718,
+      "step": 947
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 900.0,
+      "completions/max_terminated_length": 900.0,
+      "completions/mean_length": 480.8125305175781,
+      "completions/mean_terminated_length": 480.8125305175781,
+      "completions/min_length": 161.0,
+      "completions/min_terminated_length": 161.0,
+      "epoch": 0.9780758318287336,
+      "grad_norm": 0.7666967511177063,
+      "kl": 0.101806640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0079,
+      "num_tokens": 109470493.0,
+      "reward": 1.3187501430511475,
+      "reward_std": 0.13620880246162415,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31874996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.295470654964447,
+      "step": 948
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 856.0,
+      "completions/max_terminated_length": 856.0,
+      "completions/mean_length": 490.2857360839844,
+      "completions/mean_terminated_length": 490.2857360839844,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 0.9791075573897343,
+      "grad_norm": 0.8252652287483215,
+      "kl": 0.0966796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0117,
+      "num_tokens": 109593396.0,
+      "reward": 1.3191965818405151,
+      "reward_std": 0.166405588388443,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.328125,
+      "rewards/curriculum_aware_reward_fn/std": 0.31614094972610474,
+      "step": 949
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 742.0,
+      "completions/max_terminated_length": 742.0,
+      "completions/mean_length": 455.5982360839844,
+      "completions/mean_terminated_length": 455.5982360839844,
+      "completions/min_length": 263.0,
+      "completions/min_terminated_length": 263.0,
+      "epoch": 0.9801392829507352,
+      "grad_norm": 0.9774068593978882,
+      "kl": 0.107177734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0004,
+      "num_tokens": 109720835.0,
+      "reward": 1.2093751430511475,
+      "reward_std": 0.22767837345600128,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.20937499403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.27578970789909363,
+      "step": 950
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 892.0,
+      "completions/max_terminated_length": 892.0,
+      "completions/mean_length": 465.95538330078125,
+      "completions/mean_terminated_length": 465.95538330078125,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 0.9811710085117359,
+      "grad_norm": 0.7859709858894348,
+      "kl": 0.115966796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0091,
+      "num_tokens": 109841398.0,
+      "reward": 1.2906252145767212,
+      "reward_std": 0.15378578007221222,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.30406635999679565,
+      "step": 951
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1094.0,
+      "completions/max_terminated_length": 1094.0,
+      "completions/mean_length": 465.419677734375,
+      "completions/mean_terminated_length": 465.419677734375,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 0.9822027340727366,
+      "grad_norm": 0.7218555808067322,
+      "kl": 0.097412109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0014,
+      "num_tokens": 109956700.0,
+      "reward": 1.3187501430511475,
+      "reward_std": 0.12293457984924316,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3187499940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.295470654964447,
+      "step": 952
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 935.0,
+      "completions/max_terminated_length": 935.0,
+      "completions/mean_length": 536.8482666015625,
+      "completions/mean_terminated_length": 536.8482666015625,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "epoch": 0.9832344596337375,
+      "grad_norm": 0.6031023263931274,
+      "kl": 0.08154296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0244,
+      "num_tokens": 110087401.0,
+      "reward": 1.1843751668930054,
+      "reward_std": 0.15871918201446533,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.18437497317790985,
+      "rewards/curriculum_aware_reward_fn/std": 0.27378159761428833,
+      "step": 953
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 952.0,
+      "completions/max_terminated_length": 952.0,
+      "completions/mean_length": 443.5089416503906,
+      "completions/mean_terminated_length": 443.5089416503906,
+      "completions/min_length": 135.0,
+      "completions/min_terminated_length": 135.0,
+      "epoch": 0.9842661851947382,
+      "grad_norm": 0.5875775814056396,
+      "kl": 0.1033935546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0061,
+      "num_tokens": 110196931.0,
+      "reward": 1.2504466772079468,
+      "reward_std": 0.07587282359600067,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2593750059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.32121190428733826,
+      "step": 954
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 890.0,
+      "completions/max_terminated_length": 890.0,
+      "completions/mean_length": 454.5535888671875,
+      "completions/mean_terminated_length": 454.5535888671875,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 0.985297910755739,
+      "grad_norm": 0.9105221033096313,
+      "kl": 0.100341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0029,
+      "num_tokens": 110320515.0,
+      "reward": 1.2937500476837158,
+      "reward_std": 0.21526940166950226,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3135904371738434,
+      "step": 955
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 873.0,
+      "completions/max_terminated_length": 873.0,
+      "completions/mean_length": 453.607177734375,
+      "completions/mean_terminated_length": 453.607177734375,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 0.9863296363167398,
+      "grad_norm": 0.7779031991958618,
+      "kl": 0.107666015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0369,
+      "num_tokens": 110435467.0,
+      "reward": 1.3187501430511475,
+      "reward_std": 0.15156292915344238,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3187499940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.3064711391925812,
+      "step": 956
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 811.0,
+      "completions/max_terminated_length": 811.0,
+      "completions/mean_length": 516.5803833007812,
+      "completions/mean_terminated_length": 516.5803833007812,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 0.9873613618777405,
+      "grad_norm": 0.8285189867019653,
+      "kl": 0.0853271484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0022,
+      "num_tokens": 110568476.0,
+      "reward": 1.2718751430511475,
+      "reward_std": 0.19677434861660004,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2718749940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.29975825548171997,
+      "step": 957
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 967.0,
+      "completions/max_terminated_length": 967.0,
+      "completions/mean_length": 474.0982360839844,
+      "completions/mean_terminated_length": 474.0982360839844,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 0.9883930874387413,
+      "grad_norm": 0.8431401252746582,
+      "kl": 0.1002197265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0026,
+      "num_tokens": 110690413.0,
+      "reward": 1.3500001430511475,
+      "reward_std": 0.1829722374677658,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34999996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.31515759229660034,
+      "step": 958
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 912.0,
+      "completions/max_terminated_length": 912.0,
+      "completions/mean_length": 449.0625305175781,
+      "completions/mean_terminated_length": 449.0625305175781,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 0.989424812999742,
+      "grad_norm": 0.7426992058753967,
+      "kl": 0.1060791015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0066,
+      "num_tokens": 110798381.0,
+      "reward": 1.296875,
+      "reward_std": 0.12052858620882034,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.296875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3228031396865845,
+      "step": 959
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1435.0,
+      "completions/max_terminated_length": 1435.0,
+      "completions/mean_length": 451.607177734375,
+      "completions/mean_terminated_length": 451.607177734375,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "epoch": 0.9904565385607429,
+      "grad_norm": 0.7545911073684692,
+      "kl": 0.098876953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0094,
+      "num_tokens": 110920053.0,
+      "reward": 1.3285715579986572,
+      "reward_std": 0.1768074631690979,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.31839263439178467,
+      "step": 960
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 905.0,
+      "completions/max_terminated_length": 905.0,
+      "completions/mean_length": 466.8482360839844,
+      "completions/mean_terminated_length": 466.8482360839844,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 0.9914882641217436,
+      "grad_norm": 0.6909697651863098,
+      "kl": 0.09912109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0038,
+      "num_tokens": 111045826.0,
+      "reward": 1.2718751430511475,
+      "reward_std": 0.10017166286706924,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2718749940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.28850194811820984,
+      "step": 961
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 862.0,
+      "completions/max_terminated_length": 862.0,
+      "completions/mean_length": 434.5446472167969,
+      "completions/mean_terminated_length": 434.5446472167969,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "epoch": 0.9925199896827444,
+      "grad_norm": 0.8224356770515442,
+      "kl": 0.1121826171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0283,
+      "num_tokens": 111161142.0,
+      "reward": 1.3437501192092896,
+      "reward_std": 0.1928206980228424,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34375,
+      "rewards/curriculum_aware_reward_fn/std": 0.3080105185508728,
+      "step": 962
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 780.0,
+      "completions/max_terminated_length": 780.0,
+      "completions/mean_length": 444.0357360839844,
+      "completions/mean_terminated_length": 444.0357360839844,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 0.9935517152437452,
+      "grad_norm": 0.7404220700263977,
+      "kl": 0.0989990234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0082,
+      "num_tokens": 111280391.0,
+      "reward": 1.2531250715255737,
+      "reward_std": 0.16437771916389465,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25312498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.29791173338890076,
+      "step": 963
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1311.0,
+      "completions/max_terminated_length": 1311.0,
+      "completions/mean_length": 490.732177734375,
+      "completions/mean_terminated_length": 490.732177734375,
+      "completions/min_length": 148.0,
+      "completions/min_terminated_length": 148.0,
+      "epoch": 0.9945834408047459,
+      "grad_norm": 0.6985900402069092,
+      "kl": 0.0899658203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0199,
+      "num_tokens": 111401993.0,
+      "reward": 1.312500238418579,
+      "reward_std": 0.18294866383075714,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3267020583152771,
+      "step": 964
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 994.0,
+      "completions/max_terminated_length": 994.0,
+      "completions/mean_length": 453.65179443359375,
+      "completions/mean_terminated_length": 453.65179443359375,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 0.9956151663657468,
+      "grad_norm": 0.7975105047225952,
+      "kl": 0.1044921875,
+      "learning_rate": 1e-06,
+      "loss": 0.002,
+      "num_tokens": 111515641.0,
+      "reward": 1.2843750715255737,
+      "reward_std": 0.11395810544490814,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28437498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.30997174978256226,
+      "step": 965
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1003.0,
+      "completions/mean_length": 473.71429443359375,
+      "completions/mean_terminated_length": 441.0810852050781,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 0.9966468919267475,
+      "grad_norm": 0.8458168506622314,
+      "kl": 0.1005859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0504,
+      "num_tokens": 111637472.0,
+      "reward": 1.4004465341567993,
+      "reward_std": 0.22842104732990265,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40937498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3147665560245514,
+      "step": 966
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 987.0,
+      "completions/max_terminated_length": 987.0,
+      "completions/mean_length": 471.33929443359375,
+      "completions/mean_terminated_length": 471.33929443359375,
+      "completions/min_length": 245.0,
+      "completions/min_terminated_length": 245.0,
+      "epoch": 0.9976786174877482,
+      "grad_norm": 0.7415000200271606,
+      "kl": 0.0968017578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0029,
+      "num_tokens": 111761053.0,
+      "reward": 1.2250001430511475,
+      "reward_std": 0.16037410497665405,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22500000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.29660236835479736,
+      "step": 967
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 873.0,
+      "completions/max_terminated_length": 873.0,
+      "completions/mean_length": 454.669677734375,
+      "completions/mean_terminated_length": 454.669677734375,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 0.9987103430487491,
+      "grad_norm": 0.7366511225700378,
+      "kl": 0.0972900390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0025,
+      "num_tokens": 111882340.0,
+      "reward": 1.3093750476837158,
+      "reward_std": 0.0987308993935585,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30937498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.31073373556137085,
+      "step": 968
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 838.0,
+      "completions/max_terminated_length": 838.0,
+      "completions/mean_length": 385.7599792480469,
+      "completions/mean_terminated_length": 385.7599792480469,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "epoch": 0.9997420686097498,
+      "grad_norm": 0.7283992171287537,
+      "kl": 0.1048583984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0111,
+      "num_tokens": 112005959.0,
+      "reward": 1.3343751430511475,
+      "reward_std": 0.08949775248765945,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.3058757483959198,
+      "step": 969
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 921.0,
+      "completions/max_terminated_length": 921.0,
+      "completions/mean_length": 431.3750305175781,
+      "completions/mean_terminated_length": 431.3750305175781,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 1.0010317255610008,
+      "grad_norm": 0.8628621101379395,
+      "kl": 0.1007080078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0084,
+      "num_tokens": 112121459.0,
+      "reward": 1.2875001430511475,
+      "reward_std": 0.20198066532611847,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28749996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.29042527079582214,
+      "step": 970
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 815.0,
+      "completions/max_terminated_length": 815.0,
+      "completions/mean_length": 410.6785888671875,
+      "completions/mean_terminated_length": 410.6785888671875,
+      "completions/min_length": 150.0,
+      "completions/min_terminated_length": 150.0,
+      "epoch": 1.0020634511220015,
+      "grad_norm": 0.7795261144638062,
+      "kl": 0.098876953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0124,
+      "num_tokens": 112241060.0,
+      "reward": 1.3843750953674316,
+      "reward_std": 0.13455899059772491,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3843750059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.31501689553260803,
+      "step": 971
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 637.0,
+      "completions/max_terminated_length": 637.0,
+      "completions/mean_length": 373.27679443359375,
+      "completions/mean_terminated_length": 373.27679443359375,
+      "completions/min_length": 148.0,
+      "completions/min_terminated_length": 148.0,
+      "epoch": 1.0030951766830023,
+      "grad_norm": 0.7977780699729919,
+      "kl": 0.1087646484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0185,
+      "num_tokens": 112337999.0,
+      "reward": 1.2875001430511475,
+      "reward_std": 0.19644302129745483,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2874999940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.3123941123485565,
+      "step": 972
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1072.0,
+      "completions/max_terminated_length": 1072.0,
+      "completions/mean_length": 423.8482360839844,
+      "completions/mean_terminated_length": 423.8482360839844,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 1.0041269022440031,
+      "grad_norm": 0.7241541147232056,
+      "kl": 0.1025390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0088,
+      "num_tokens": 112453216.0,
+      "reward": 1.3000000715255737,
+      "reward_std": 0.08505426347255707,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29999998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.29660236835479736,
+      "step": 973
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1138.0,
+      "completions/max_terminated_length": 1138.0,
+      "completions/mean_length": 449.33929443359375,
+      "completions/mean_terminated_length": 449.33929443359375,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 1.005158627805004,
+      "grad_norm": 0.6764619946479797,
+      "kl": 0.1170654296875,
+      "learning_rate": 1e-06,
+      "loss": -0.014,
+      "num_tokens": 112571680.0,
+      "reward": 1.3156250715255737,
+      "reward_std": 0.1194203794002533,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.32873159646987915,
+      "step": 974
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 747.0,
+      "completions/max_terminated_length": 747.0,
+      "completions/mean_length": 380.96429443359375,
+      "completions/mean_terminated_length": 380.96429443359375,
+      "completions/min_length": 120.0,
+      "completions/min_terminated_length": 120.0,
+      "epoch": 1.0061903533660046,
+      "grad_norm": 0.7338297963142395,
+      "kl": 0.1099853515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0013,
+      "num_tokens": 112679644.0,
+      "reward": 1.4093750715255737,
+      "reward_std": 0.1461634635925293,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40937498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3147665560245514,
+      "step": 975
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1149.0,
+      "completions/max_terminated_length": 1149.0,
+      "completions/mean_length": 400.46429443359375,
+      "completions/mean_terminated_length": 400.46429443359375,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 1.0072220789270054,
+      "grad_norm": 0.7369305491447449,
+      "kl": 0.1041259765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0037,
+      "num_tokens": 112792788.0,
+      "reward": 1.2906250953674316,
+      "reward_std": 0.13024835288524628,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3004149794578552,
+      "step": 976
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 989.0,
+      "completions/max_terminated_length": 989.0,
+      "completions/mean_length": 425.5089416503906,
+      "completions/mean_terminated_length": 425.5089416503906,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 1.0082538044880063,
+      "grad_norm": 0.6680691838264465,
+      "kl": 0.0927734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0043,
+      "num_tokens": 112908002.0,
+      "reward": 1.231250286102295,
+      "reward_std": 0.11469794809818268,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.23125000298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.31005123257637024,
+      "step": 977
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 568.0,
+      "completions/max_terminated_length": 568.0,
+      "completions/mean_length": 371.0446472167969,
+      "completions/mean_terminated_length": 371.0446472167969,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 1.009285530049007,
+      "grad_norm": 0.8868218064308167,
+      "kl": 0.1068115234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0073,
+      "num_tokens": 113017130.0,
+      "reward": 1.4406250715255737,
+      "reward_std": 0.15701918303966522,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44062498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.29243701696395874,
+      "step": 978
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1042.0,
+      "completions/max_terminated_length": 1042.0,
+      "completions/mean_length": 428.21429443359375,
+      "completions/mean_terminated_length": 428.21429443359375,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "epoch": 1.0103172556100077,
+      "grad_norm": 0.9351738095283508,
+      "kl": 0.096435546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0108,
+      "num_tokens": 113132682.0,
+      "reward": 1.2781251668930054,
+      "reward_std": 0.1593979001045227,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.27812498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.2824268341064453,
+      "step": 979
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 757.0,
+      "completions/max_terminated_length": 757.0,
+      "completions/mean_length": 401.1696472167969,
+      "completions/mean_terminated_length": 401.1696472167969,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 1.0113489811710086,
+      "grad_norm": 0.6989129185676575,
+      "kl": 0.106201171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0059,
+      "num_tokens": 113246479.0,
+      "reward": 1.1906250715255737,
+      "reward_std": 0.12484891712665558,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19062498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.2853424549102783,
+      "step": 980
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 627.0,
+      "completions/max_terminated_length": 627.0,
+      "completions/mean_length": 375.8839416503906,
+      "completions/mean_terminated_length": 375.8839416503906,
+      "completions/min_length": 142.0,
+      "completions/min_terminated_length": 142.0,
+      "epoch": 1.0123807067320092,
+      "grad_norm": 1.0648372173309326,
+      "kl": 0.1033935546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0037,
+      "num_tokens": 113351592.0,
+      "reward": 1.3910716772079468,
+      "reward_std": 0.2463470697402954,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4000000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.39266932010650635,
+      "step": 981
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 698.0,
+      "completions/max_terminated_length": 698.0,
+      "completions/mean_length": 395.5357360839844,
+      "completions/mean_terminated_length": 395.5357360839844,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "epoch": 1.01341243229301,
+      "grad_norm": 0.8360050320625305,
+      "kl": 0.1005859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0123,
+      "num_tokens": 113455371.0,
+      "reward": 1.3562501668930054,
+      "reward_std": 0.16068443655967712,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3185782730579376,
+      "step": 982
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 699.0,
+      "completions/max_terminated_length": 699.0,
+      "completions/mean_length": 398.9910888671875,
+      "completions/mean_terminated_length": 398.9910888671875,
+      "completions/min_length": 150.0,
+      "completions/min_terminated_length": 150.0,
+      "epoch": 1.0144441578540109,
+      "grad_norm": 0.8212693929672241,
+      "kl": 0.107177734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0121,
+      "num_tokens": 113563100.0,
+      "reward": 1.2875001430511475,
+      "reward_std": 0.16679814457893372,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28749996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.2979282736778259,
+      "step": 983
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1056.0,
+      "completions/max_terminated_length": 1056.0,
+      "completions/mean_length": 427.5982360839844,
+      "completions/mean_terminated_length": 427.5982360839844,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 1.0154758834150117,
+      "grad_norm": 0.7585737705230713,
+      "kl": 0.0975341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.004,
+      "num_tokens": 113681530.0,
+      "reward": 1.25,
+      "reward_std": 0.12833638489246368,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25,
+      "rewards/curriculum_aware_reward_fn/std": 0.31315022706985474,
+      "step": 984
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1355.0,
+      "completions/max_terminated_length": 1355.0,
+      "completions/mean_length": 390.95538330078125,
+      "completions/mean_terminated_length": 390.95538330078125,
+      "completions/min_length": 131.0,
+      "completions/min_terminated_length": 131.0,
+      "epoch": 1.0165076089760123,
+      "grad_norm": 0.9035147428512573,
+      "kl": 0.100830078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0315,
+      "num_tokens": 113783221.0,
+      "reward": 1.343750238418579,
+      "reward_std": 0.12016370892524719,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34375,
+      "rewards/curriculum_aware_reward_fn/std": 0.3080105185508728,
+      "step": 985
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 877.0,
+      "completions/max_terminated_length": 877.0,
+      "completions/mean_length": 403.9732360839844,
+      "completions/mean_terminated_length": 403.9732360839844,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 1.0175393345370132,
+      "grad_norm": 0.8435715436935425,
+      "kl": 0.0992431640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0296,
+      "num_tokens": 113895435.0,
+      "reward": 1.2816966772079468,
+      "reward_std": 0.11472270637750626,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3182533085346222,
+      "step": 986
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 692.0,
+      "completions/max_terminated_length": 692.0,
+      "completions/mean_length": 405.1964416503906,
+      "completions/mean_terminated_length": 405.1964416503906,
+      "completions/min_length": 71.0,
+      "completions/min_terminated_length": 71.0,
+      "epoch": 1.018571060098014,
+      "grad_norm": 0.8098063468933105,
+      "kl": 0.103759765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0196,
+      "num_tokens": 114014128.0,
+      "reward": 1.2468751668930054,
+      "reward_std": 0.1147625669836998,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24687500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.2994951903820038,
+      "step": 987
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 600.0,
+      "completions/max_terminated_length": 600.0,
+      "completions/mean_length": 376.3482360839844,
+      "completions/mean_terminated_length": 376.3482360839844,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 1.0196027856590146,
+      "grad_norm": 0.7244241237640381,
+      "kl": 0.1068115234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0154,
+      "num_tokens": 114123374.0,
+      "reward": 1.2906252145767212,
+      "reward_std": 0.10551070421934128,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3004149794578552,
+      "step": 988
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 910.0,
+      "completions/max_terminated_length": 910.0,
+      "completions/mean_length": 431.0357360839844,
+      "completions/mean_terminated_length": 431.0357360839844,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 1.0206345112200155,
+      "grad_norm": 0.8381537199020386,
+      "kl": 0.1102294921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0137,
+      "num_tokens": 114237177.0,
+      "reward": 1.3343751430511475,
+      "reward_std": 0.11328289657831192,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.3094627261161804,
+      "step": 989
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 827.0,
+      "completions/max_terminated_length": 827.0,
+      "completions/mean_length": 435.4910888671875,
+      "completions/mean_terminated_length": 435.4910888671875,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 1.0216662367810163,
+      "grad_norm": 0.8157406449317932,
+      "kl": 0.101318359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0178,
+      "num_tokens": 114354958.0,
+      "reward": 1.2875001430511475,
+      "reward_std": 0.16996197402477264,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28749996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.3016097843647003,
+      "step": 990
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 909.0,
+      "completions/max_terminated_length": 909.0,
+      "completions/mean_length": 390.4910888671875,
+      "completions/mean_terminated_length": 390.4910888671875,
+      "completions/min_length": 98.0,
+      "completions/min_terminated_length": 98.0,
+      "epoch": 1.022697962342017,
+      "grad_norm": 0.8390117287635803,
+      "kl": 0.0965576171875,
+      "learning_rate": 1e-06,
+      "loss": 0.017,
+      "num_tokens": 114459852.0,
+      "reward": 1.2781251668930054,
+      "reward_std": 0.1394372582435608,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.27812498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.31212589144706726,
+      "step": 991
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 875.0,
+      "completions/max_terminated_length": 875.0,
+      "completions/mean_length": 416.14288330078125,
+      "completions/mean_terminated_length": 416.14288330078125,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 1.0237296879030178,
+      "grad_norm": 1.0235214233398438,
+      "kl": 0.0997314453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0026,
+      "num_tokens": 114565302.0,
+      "reward": 1.4406250715255737,
+      "reward_std": 0.20136210322380066,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44062498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.28478938341140747,
+      "step": 992
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1049.0,
+      "completions/max_terminated_length": 1049.0,
+      "completions/mean_length": 459.7857360839844,
+      "completions/mean_terminated_length": 459.7857360839844,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 1.0247614134640186,
+      "grad_norm": 0.8590009808540344,
+      "kl": 0.10009765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0003,
+      "num_tokens": 114687589.0,
+      "reward": 1.2875001430511475,
+      "reward_std": 0.1746867150068283,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28749996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.3228183686733246,
+      "step": 993
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 860.0,
+      "completions/max_terminated_length": 860.0,
+      "completions/mean_length": 406.4732360839844,
+      "completions/mean_terminated_length": 406.4732360839844,
+      "completions/min_length": 151.0,
+      "completions/min_terminated_length": 151.0,
+      "epoch": 1.0257931390250195,
+      "grad_norm": 0.6487825512886047,
+      "kl": 0.0975341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0204,
+      "num_tokens": 114803793.0,
+      "reward": 1.3093751668930054,
+      "reward_std": 0.11713527143001556,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.33467283844947815,
+      "step": 994
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 826.0,
+      "completions/max_terminated_length": 826.0,
+      "completions/mean_length": 418.6875305175781,
+      "completions/mean_terminated_length": 418.6875305175781,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 1.02682486458602,
+      "grad_norm": 0.6680878400802612,
+      "kl": 0.096923828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0145,
+      "num_tokens": 114917662.0,
+      "reward": 1.2125000953674316,
+      "reward_std": 0.10435570031404495,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21249999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.2947360873222351,
+      "step": 995
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 738.0,
+      "completions/max_terminated_length": 738.0,
+      "completions/mean_length": 440.0714416503906,
+      "completions/mean_terminated_length": 440.0714416503906,
+      "completions/min_length": 155.0,
+      "completions/min_terminated_length": 155.0,
+      "epoch": 1.027856590147021,
+      "grad_norm": 0.8054993152618408,
+      "kl": 0.094970703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0191,
+      "num_tokens": 115030440.0,
+      "reward": 1.3593751192092896,
+      "reward_std": 0.13213443756103516,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.359375,
+      "rewards/curriculum_aware_reward_fn/std": 0.2988364100456238,
+      "step": 996
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1444.0,
+      "completions/max_terminated_length": 1444.0,
+      "completions/mean_length": 443.6250305175781,
+      "completions/mean_terminated_length": 443.6250305175781,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 1.0288883157080218,
+      "grad_norm": 0.734188437461853,
+      "kl": 0.0919189453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0174,
+      "num_tokens": 115150728.0,
+      "reward": 1.2937501668930054,
+      "reward_std": 0.1578095704317093,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.31005120277404785,
+      "step": 997
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 805.0,
+      "completions/max_terminated_length": 805.0,
+      "completions/mean_length": 461.1964416503906,
+      "completions/mean_terminated_length": 461.1964416503906,
+      "completions/min_length": 263.0,
+      "completions/min_terminated_length": 263.0,
+      "epoch": 1.0299200412690224,
+      "grad_norm": 0.7558555603027344,
+      "kl": 0.0980224609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0079,
+      "num_tokens": 115279081.0,
+      "reward": 1.3250000476837158,
+      "reward_std": 0.13827043771743774,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32499998807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.30704930424690247,
+      "step": 998
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 832.0,
+      "completions/max_terminated_length": 832.0,
+      "completions/mean_length": 406.14288330078125,
+      "completions/mean_terminated_length": 406.14288330078125,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 1.0309517668300232,
+      "grad_norm": 0.8551933169364929,
+      "kl": 0.0989990234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0111,
+      "num_tokens": 115391874.0,
+      "reward": 1.3593751192092896,
+      "reward_std": 0.13182500004768372,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.359375,
+      "rewards/curriculum_aware_reward_fn/std": 0.3025068938732147,
+      "step": 999
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 738.0,
+      "completions/max_terminated_length": 738.0,
+      "completions/mean_length": 403.70538330078125,
+      "completions/mean_terminated_length": 403.70538330078125,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 1.031983492391024,
+      "grad_norm": 0.8113073706626892,
+      "kl": 0.1077880859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0096,
+      "num_tokens": 115502399.0,
+      "reward": 1.3312500715255737,
+      "reward_std": 0.15864074230194092,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33124998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.32153382897377014,
+      "step": 1000
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1075.0,
+      "completions/max_terminated_length": 1075.0,
+      "completions/mean_length": 444.607177734375,
+      "completions/mean_terminated_length": 444.607177734375,
+      "completions/min_length": 259.0,
+      "completions/min_terminated_length": 259.0,
+      "epoch": 1.0330152179520247,
+      "grad_norm": 0.778337299823761,
+      "kl": 0.099609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0138,
+      "num_tokens": 115610512.0,
+      "reward": 1.265625238418579,
+      "reward_std": 0.1536625176668167,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.265625,
+      "rewards/curriculum_aware_reward_fn/std": 0.2867204546928406,
+      "step": 1001
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 983.0,
+      "completions/max_terminated_length": 983.0,
+      "completions/mean_length": 426.5000305175781,
+      "completions/mean_terminated_length": 426.5000305175781,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 1.0340469435130255,
+      "grad_norm": 0.7531226873397827,
+      "kl": 0.1019287109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0246,
+      "num_tokens": 115723513.0,
+      "reward": 1.390625238418579,
+      "reward_std": 0.13248291611671448,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.390625,
+      "rewards/curriculum_aware_reward_fn/std": 0.28478941321372986,
+      "step": 1002
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 914.0,
+      "completions/max_terminated_length": 914.0,
+      "completions/mean_length": 430.8214416503906,
+      "completions/mean_terminated_length": 430.8214416503906,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 1.0350786690740263,
+      "grad_norm": 0.8895756006240845,
+      "kl": 0.1007080078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0084,
+      "num_tokens": 115835367.0,
+      "reward": 1.1968750953674316,
+      "reward_std": 0.16403761506080627,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.2925718128681183,
+      "step": 1003
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 978.0,
+      "completions/max_terminated_length": 978.0,
+      "completions/mean_length": 489.96429443359375,
+      "completions/mean_terminated_length": 489.96429443359375,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 1.0361103946350272,
+      "grad_norm": 0.611074686050415,
+      "kl": 0.085693359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0146,
+      "num_tokens": 115972710.0,
+      "reward": 1.1343750953674316,
+      "reward_std": 0.10016217827796936,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.2061040699481964,
+      "step": 1004
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 765.0,
+      "completions/max_terminated_length": 765.0,
+      "completions/mean_length": 413.2500305175781,
+      "completions/mean_terminated_length": 413.2500305175781,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 1.0371421201960278,
+      "grad_norm": 0.7905242443084717,
+      "kl": 0.0982666015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0153,
+      "num_tokens": 116086103.0,
+      "reward": 1.2718751430511475,
+      "reward_std": 0.15575583279132843,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.27187496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.31060686707496643,
+      "step": 1005
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 701.0,
+      "completions/max_terminated_length": 701.0,
+      "completions/mean_length": 424.6696472167969,
+      "completions/mean_terminated_length": 424.6696472167969,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 1.0381738457570286,
+      "grad_norm": 0.92307049036026,
+      "kl": 0.11328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0073,
+      "num_tokens": 116206452.0,
+      "reward": 1.3062502145767212,
+      "reward_std": 0.19037002325057983,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3791363537311554,
+      "step": 1006
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 744.0,
+      "completions/max_terminated_length": 744.0,
+      "completions/mean_length": 427.5714416503906,
+      "completions/mean_terminated_length": 427.5714416503906,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 1.0392055713180295,
+      "grad_norm": 0.6816158294677734,
+      "kl": 0.1048583984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0107,
+      "num_tokens": 116319797.0,
+      "reward": 1.3375002145767212,
+      "reward_std": 0.1783868372440338,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.37268805503845215,
+      "step": 1007
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 763.0,
+      "completions/max_terminated_length": 763.0,
+      "completions/mean_length": 421.8839416503906,
+      "completions/mean_terminated_length": 421.8839416503906,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 1.04023729687903,
+      "grad_norm": 0.7737622857093811,
+      "kl": 0.1016845703125,
+      "learning_rate": 1e-06,
+      "loss": 0.004,
+      "num_tokens": 116438546.0,
+      "reward": 1.28125,
+      "reward_std": 0.14772114157676697,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.3180830180644989,
+      "step": 1008
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 658.0,
+      "completions/max_terminated_length": 658.0,
+      "completions/mean_length": 402.9910888671875,
+      "completions/mean_terminated_length": 402.9910888671875,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 1.041269022440031,
+      "grad_norm": 0.7403515577316284,
+      "kl": 0.1068115234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0017,
+      "num_tokens": 116546933.0,
+      "reward": 1.2687500715255737,
+      "reward_std": 0.08907577395439148,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.26874998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.2895418107509613,
+      "step": 1009
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 809.0,
+      "completions/max_terminated_length": 809.0,
+      "completions/mean_length": 407.7232360839844,
+      "completions/mean_terminated_length": 407.7232360839844,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 1.0423007480010318,
+      "grad_norm": 0.7900381684303284,
+      "kl": 0.1107177734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0003,
+      "num_tokens": 116658636.0,
+      "reward": 1.3625000715255737,
+      "reward_std": 0.12463202327489853,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3078185021877289,
+      "step": 1010
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 845.0,
+      "completions/max_terminated_length": 845.0,
+      "completions/mean_length": 424.1785888671875,
+      "completions/mean_terminated_length": 424.1785888671875,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 1.0433324735620324,
+      "grad_norm": 0.6202005743980408,
+      "kl": 0.097412109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0115,
+      "num_tokens": 116775815.0,
+      "reward": 1.3500001430511475,
+      "reward_std": 0.09745357185602188,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3499999940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.31515759229660034,
+      "step": 1011
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 740.0,
+      "completions/max_terminated_length": 740.0,
+      "completions/mean_length": 428.9285888671875,
+      "completions/mean_terminated_length": 428.9285888671875,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 1.0443641991230332,
+      "grad_norm": 0.9668126106262207,
+      "kl": 0.099365234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0018,
+      "num_tokens": 116894856.0,
+      "reward": 1.25,
+      "reward_std": 0.1951034516096115,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25,
+      "rewards/curriculum_aware_reward_fn/std": 0.3060206472873688,
+      "step": 1012
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 959.0,
+      "completions/max_terminated_length": 959.0,
+      "completions/mean_length": 440.3839416503906,
+      "completions/mean_terminated_length": 440.3839416503906,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 1.045395924684034,
+      "grad_norm": 0.7224416732788086,
+      "kl": 0.0953369140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0034,
+      "num_tokens": 116999829.0,
+      "reward": 1.390625238418579,
+      "reward_std": 0.13267162442207336,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.390625,
+      "rewards/curriculum_aware_reward_fn/std": 0.2729164659976959,
+      "step": 1013
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 927.0,
+      "completions/max_terminated_length": 927.0,
+      "completions/mean_length": 456.1964416503906,
+      "completions/mean_terminated_length": 456.1964416503906,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 1.046427650245035,
+      "grad_norm": 0.6741520166397095,
+      "kl": 0.0992431640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0188,
+      "num_tokens": 117116037.0,
+      "reward": 1.2843750715255737,
+      "reward_std": 0.10286222398281097,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28437501192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.30997174978256226,
+      "step": 1014
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 714.0,
+      "completions/max_terminated_length": 714.0,
+      "completions/mean_length": 427.83929443359375,
+      "completions/mean_terminated_length": 427.83929443359375,
+      "completions/min_length": 143.0,
+      "completions/min_terminated_length": 143.0,
+      "epoch": 1.0474593758060355,
+      "grad_norm": 0.8775485157966614,
+      "kl": 0.1038818359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0016,
+      "num_tokens": 117224638.0,
+      "reward": 1.3625000715255737,
+      "reward_std": 0.19390401244163513,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.31839263439178467,
+      "step": 1015
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3500.0,
+      "completions/max_terminated_length": 3500.0,
+      "completions/mean_length": 471.83038330078125,
+      "completions/mean_terminated_length": 471.83038330078125,
+      "completions/min_length": 145.0,
+      "completions/min_terminated_length": 145.0,
+      "epoch": 1.0484911013670364,
+      "grad_norm": 0.8173244595527649,
+      "kl": 0.095458984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0017,
+      "num_tokens": 117345691.0,
+      "reward": 1.3250001668930054,
+      "reward_std": 0.1681494414806366,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32500001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3345697820186615,
+      "step": 1016
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 839.0,
+      "completions/max_terminated_length": 839.0,
+      "completions/mean_length": 437.33038330078125,
+      "completions/mean_terminated_length": 437.33038330078125,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "epoch": 1.0495228269280372,
+      "grad_norm": 0.6973631381988525,
+      "kl": 0.0870361328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0043,
+      "num_tokens": 117458253.0,
+      "reward": 1.3625000715255737,
+      "reward_std": 0.1287010908126831,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.31490740180015564,
+      "step": 1017
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 844.0,
+      "completions/max_terminated_length": 844.0,
+      "completions/mean_length": 437.8035888671875,
+      "completions/mean_terminated_length": 437.8035888671875,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 1.0505545524890378,
+      "grad_norm": 0.7553476691246033,
+      "kl": 0.1016845703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0007,
+      "num_tokens": 117575376.0,
+      "reward": 1.3062500953674316,
+      "reward_std": 0.16072815656661987,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.31559503078460693,
+      "step": 1018
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1148.0,
+      "completions/max_terminated_length": 1148.0,
+      "completions/mean_length": 496.02679443359375,
+      "completions/mean_terminated_length": 496.02679443359375,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 1.0515862780500387,
+      "grad_norm": 0.6043642163276672,
+      "kl": 0.08447265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0091,
+      "num_tokens": 117692713.0,
+      "reward": 1.3406251668930054,
+      "reward_std": 0.1272299736738205,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34062501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.31326034665107727,
+      "step": 1019
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 772.0,
+      "completions/max_terminated_length": 772.0,
+      "completions/mean_length": 450.02679443359375,
+      "completions/mean_terminated_length": 450.02679443359375,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 1.0526180036110395,
+      "grad_norm": 0.71767258644104,
+      "kl": 0.095703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0111,
+      "num_tokens": 117816700.0,
+      "reward": 1.2468751668930054,
+      "reward_std": 0.08287324756383896,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24687497317790985,
+      "rewards/curriculum_aware_reward_fn/std": 0.30677640438079834,
+      "step": 1020
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1677.0,
+      "completions/max_terminated_length": 1677.0,
+      "completions/mean_length": 450.3125305175781,
+      "completions/mean_terminated_length": 450.3125305175781,
+      "completions/min_length": 166.0,
+      "completions/min_terminated_length": 166.0,
+      "epoch": 1.0536497291720401,
+      "grad_norm": 0.7819437384605408,
+      "kl": 0.1041259765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0219,
+      "num_tokens": 117928721.0,
+      "reward": 1.3156250715255737,
+      "reward_std": 0.15827159583568573,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3114938735961914,
+      "step": 1021
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 836.0,
+      "completions/max_terminated_length": 836.0,
+      "completions/mean_length": 427.4910888671875,
+      "completions/mean_terminated_length": 427.4910888671875,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 1.054681454733041,
+      "grad_norm": 0.8454352617263794,
+      "kl": 0.094970703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0094,
+      "num_tokens": 118039684.0,
+      "reward": 1.3250001668930054,
+      "reward_std": 0.14444464445114136,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32499998807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.3141555190086365,
+      "step": 1022
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 771.0,
+      "completions/max_terminated_length": 771.0,
+      "completions/mean_length": 440.70538330078125,
+      "completions/mean_terminated_length": 440.70538330078125,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 1.0557131802940418,
+      "grad_norm": 0.764153242111206,
+      "kl": 0.087646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0014,
+      "num_tokens": 118149803.0,
+      "reward": 1.4250000715255737,
+      "reward_std": 0.15279839932918549,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42499998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.31315022706985474,
+      "step": 1023
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 816.0,
+      "completions/max_terminated_length": 816.0,
+      "completions/mean_length": 488.0357360839844,
+      "completions/mean_terminated_length": 488.0357360839844,
+      "completions/min_length": 271.0,
+      "completions/min_terminated_length": 271.0,
+      "epoch": 1.0567449058550427,
+      "grad_norm": 0.7056334018707275,
+      "kl": 0.0885009765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0292,
+      "num_tokens": 118285908.0,
+      "reward": 1.2906252145767212,
+      "reward_std": 0.14304517209529877,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.2853424549102783,
+      "step": 1024
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 988.0,
+      "completions/max_terminated_length": 988.0,
+      "completions/mean_length": 430.5714416503906,
+      "completions/mean_terminated_length": 430.5714416503906,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 1.0577766314160433,
+      "grad_norm": 0.797414243221283,
+      "kl": 0.096923828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0307,
+      "num_tokens": 118399351.0,
+      "reward": 1.3875001668930054,
+      "reward_std": 0.16624128818511963,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38749998807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.30935123562812805,
+      "step": 1025
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 910.0,
+      "completions/max_terminated_length": 910.0,
+      "completions/mean_length": 426.5982360839844,
+      "completions/mean_terminated_length": 426.5982360839844,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 1.0588083569770441,
+      "grad_norm": 0.750198483467102,
+      "kl": 0.0948486328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0085,
+      "num_tokens": 118510515.0,
+      "reward": 1.3562500476837158,
+      "reward_std": 0.14397019147872925,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3080105185508728,
+      "step": 1026
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 804.0,
+      "completions/max_terminated_length": 804.0,
+      "completions/mean_length": 463.20538330078125,
+      "completions/mean_terminated_length": 463.20538330078125,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 1.059840082538045,
+      "grad_norm": 0.7545668482780457,
+      "kl": 0.09130859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0045,
+      "num_tokens": 118621383.0,
+      "reward": 1.28125,
+      "reward_std": 0.14572392404079437,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3038880527019501,
+      "step": 1027
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 898.0,
+      "completions/max_terminated_length": 898.0,
+      "completions/mean_length": 391.6875305175781,
+      "completions/mean_terminated_length": 391.6875305175781,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 1.0608718080990456,
+      "grad_norm": 0.717291533946991,
+      "kl": 0.0943603515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0002,
+      "num_tokens": 118734657.0,
+      "reward": 1.3031251430511475,
+      "reward_std": 0.13526920974254608,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3031249940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.2914920449256897,
+      "step": 1028
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 756.0,
+      "completions/max_terminated_length": 756.0,
+      "completions/mean_length": 413.7321472167969,
+      "completions/mean_terminated_length": 413.7321472167969,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 1.0619035336600464,
+      "grad_norm": 0.7581539750099182,
+      "kl": 0.0992431640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0194,
+      "num_tokens": 118846298.0,
+      "reward": 1.3906251192092896,
+      "reward_std": 0.13097161054611206,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.390625,
+      "rewards/curriculum_aware_reward_fn/std": 0.3280114233493805,
+      "step": 1029
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1135.0,
+      "completions/max_terminated_length": 1135.0,
+      "completions/mean_length": 493.58038330078125,
+      "completions/mean_terminated_length": 493.58038330078125,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 1.0629352592210473,
+      "grad_norm": 0.6710352897644043,
+      "kl": 0.0892333984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0075,
+      "num_tokens": 118974442.0,
+      "reward": 1.2312501668930054,
+      "reward_std": 0.13597580790519714,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.23124997317790985,
+      "rewards/curriculum_aware_reward_fn/std": 0.3028486967086792,
+      "step": 1030
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 818.0,
+      "completions/max_terminated_length": 818.0,
+      "completions/mean_length": 426.7857360839844,
+      "completions/mean_terminated_length": 426.7857360839844,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 1.0639669847820479,
+      "grad_norm": 0.6972105503082275,
+      "kl": 0.0975341796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0191,
+      "num_tokens": 119095896.0,
+      "reward": 1.3250001668930054,
+      "reward_std": 0.09518764913082123,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32499998807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.29231905937194824,
+      "step": 1031
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1136.0,
+      "completions/max_terminated_length": 1136.0,
+      "completions/mean_length": 487.7589416503906,
+      "completions/mean_terminated_length": 487.7589416503906,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 1.0649987103430487,
+      "grad_norm": 0.8523579835891724,
+      "kl": 0.08984375,
+      "learning_rate": 1e-06,
+      "loss": 0.001,
+      "num_tokens": 119217277.0,
+      "reward": 1.234375,
+      "reward_std": 0.16692785918712616,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2343749850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.2714684009552002,
+      "step": 1032
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 811.0,
+      "completions/max_terminated_length": 811.0,
+      "completions/mean_length": 411.0625305175781,
+      "completions/mean_terminated_length": 411.0625305175781,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 1.0660304359040496,
+      "grad_norm": 0.7746363282203674,
+      "kl": 0.0946044921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0123,
+      "num_tokens": 119329711.0,
+      "reward": 1.347321629524231,
+      "reward_std": 0.17235055565834045,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.31509506702423096,
+      "step": 1033
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1049.0,
+      "completions/max_terminated_length": 1049.0,
+      "completions/mean_length": 443.4464416503906,
+      "completions/mean_terminated_length": 443.4464416503906,
+      "completions/min_length": 276.0,
+      "completions/min_terminated_length": 276.0,
+      "epoch": 1.0670621614650504,
+      "grad_norm": 0.7955756783485413,
+      "kl": 0.0926513671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0177,
+      "num_tokens": 119444519.0,
+      "reward": 1.3562501668930054,
+      "reward_std": 0.16780826449394226,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.32202377915382385,
+      "step": 1034
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 917.0,
+      "completions/max_terminated_length": 917.0,
+      "completions/mean_length": 464.6160888671875,
+      "completions/mean_terminated_length": 464.6160888671875,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 1.068093887026051,
+      "grad_norm": 0.7193375825881958,
+      "kl": 0.091796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0262,
+      "num_tokens": 119563924.0,
+      "reward": 1.2843750715255737,
+      "reward_std": 0.13995222747325897,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28437498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.29538729786872864,
+      "step": 1035
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1595.0,
+      "completions/max_terminated_length": 1595.0,
+      "completions/mean_length": 491.5625305175781,
+      "completions/mean_terminated_length": 491.5625305175781,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 1.0691256125870519,
+      "grad_norm": 0.6852903366088867,
+      "kl": 0.084228515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0172,
+      "num_tokens": 119688482.0,
+      "reward": 1.3062500953674316,
+      "reward_std": 0.11520179361104965,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.2862561047077179,
+      "step": 1036
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 966.0,
+      "completions/max_terminated_length": 966.0,
+      "completions/mean_length": 468.9464416503906,
+      "completions/mean_terminated_length": 468.9464416503906,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 1.0701573381480527,
+      "grad_norm": 0.6984896063804626,
+      "kl": 0.0897216796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0114,
+      "num_tokens": 119805411.0,
+      "reward": 1.3125,
+      "reward_std": 0.17140275239944458,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3267020583152771,
+      "step": 1037
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 980.0,
+      "completions/max_terminated_length": 980.0,
+      "completions/mean_length": 458.4285888671875,
+      "completions/mean_terminated_length": 458.4285888671875,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 1.0711890637090533,
+      "grad_norm": 0.7449373006820679,
+      "kl": 0.090576171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0057,
+      "num_tokens": 119919573.0,
+      "reward": 1.4468752145767212,
+      "reward_std": 0.17173954844474792,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4468750059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.36756327748298645,
+      "step": 1038
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 871.0,
+      "completions/max_terminated_length": 871.0,
+      "completions/mean_length": 480.46429443359375,
+      "completions/mean_terminated_length": 480.46429443359375,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 1.0722207892700542,
+      "grad_norm": 0.6695937514305115,
+      "kl": 0.0924072265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0048,
+      "num_tokens": 120043756.0,
+      "reward": 1.3093751668930054,
+      "reward_std": 0.13723015785217285,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3071615993976593,
+      "step": 1039
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 981.0,
+      "completions/max_terminated_length": 981.0,
+      "completions/mean_length": 493.4285888671875,
+      "completions/mean_terminated_length": 493.4285888671875,
+      "completions/min_length": 258.0,
+      "completions/min_terminated_length": 258.0,
+      "epoch": 1.073252514831055,
+      "grad_norm": 0.6117882132530212,
+      "kl": 0.1026611328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0164,
+      "num_tokens": 120172303.0,
+      "reward": 1.1937501430511475,
+      "reward_std": 0.1261538565158844,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19375000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.28515249490737915,
+      "step": 1040
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 849.0,
+      "completions/max_terminated_length": 849.0,
+      "completions/mean_length": 458.9107360839844,
+      "completions/mean_terminated_length": 458.9107360839844,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "epoch": 1.0742842403920556,
+      "grad_norm": 0.619491696357727,
+      "kl": 0.0947265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0123,
+      "num_tokens": 120282844.0,
+      "reward": 1.3156250715255737,
+      "reward_std": 0.11784723401069641,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3386533558368683,
+      "step": 1041
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1079.0,
+      "completions/max_terminated_length": 1079.0,
+      "completions/mean_length": 520.6964721679688,
+      "completions/mean_terminated_length": 520.6964721679688,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "epoch": 1.0753159659530565,
+      "grad_norm": 0.47998204827308655,
+      "kl": 0.0845947265625,
+      "learning_rate": 1e-06,
+      "loss": 0.009,
+      "num_tokens": 120414977.0,
+      "reward": 1.1937501430511475,
+      "reward_std": 0.08792105317115784,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19375000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.2773040235042572,
+      "step": 1042
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 737.0,
+      "completions/max_terminated_length": 737.0,
+      "completions/mean_length": 462.5089416503906,
+      "completions/mean_terminated_length": 462.5089416503906,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 1.0763476915140573,
+      "grad_norm": 62.07821273803711,
+      "kl": 10.761962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.1105,
+      "num_tokens": 120526782.0,
+      "reward": 1.303125023841858,
+      "reward_std": 0.11684229224920273,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30312496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.31338611245155334,
+      "step": 1043
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 940.0,
+      "completions/max_terminated_length": 940.0,
+      "completions/mean_length": 488.7500305175781,
+      "completions/mean_terminated_length": 488.7500305175781,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 1.0773794170750581,
+      "grad_norm": 0.7104902267456055,
+      "kl": 0.0926513671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0143,
+      "num_tokens": 120650962.0,
+      "reward": 1.2750000953674316,
+      "reward_std": 0.16270779073238373,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.2987210154533386,
+      "step": 1044
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1014.0,
+      "completions/max_terminated_length": 1014.0,
+      "completions/mean_length": 468.4464416503906,
+      "completions/mean_terminated_length": 468.4464416503906,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 1.0784111426360588,
+      "grad_norm": 0.7016794681549072,
+      "kl": 0.0914306640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0162,
+      "num_tokens": 120775003.0,
+      "reward": 1.3812501430511475,
+      "reward_std": 0.12865351140499115,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3812499940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.27616459131240845,
+      "step": 1045
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 869.0,
+      "completions/max_terminated_length": 869.0,
+      "completions/mean_length": 445.982177734375,
+      "completions/mean_terminated_length": 445.982177734375,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 1.0794428681970596,
+      "grad_norm": 0.8315629363059998,
+      "kl": 0.104736328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0075,
+      "num_tokens": 120900882.0,
+      "reward": 1.3031251430511475,
+      "reward_std": 0.15639452636241913,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30312496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.31688809394836426,
+      "step": 1046
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 834.0,
+      "completions/max_terminated_length": 834.0,
+      "completions/mean_length": 458.26788330078125,
+      "completions/mean_terminated_length": 458.26788330078125,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "epoch": 1.0804745937580604,
+      "grad_norm": 0.672211766242981,
+      "kl": 0.09765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0067,
+      "num_tokens": 121022026.0,
+      "reward": 1.3250001668930054,
+      "reward_std": 0.12497055530548096,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32499998807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.288519024848938,
+      "step": 1047
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 882.0,
+      "completions/max_terminated_length": 882.0,
+      "completions/mean_length": 449.58929443359375,
+      "completions/mean_terminated_length": 449.58929443359375,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 1.081506319319061,
+      "grad_norm": 0.7990990877151489,
+      "kl": 0.092529296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0028,
+      "num_tokens": 121135310.0,
+      "reward": 1.3531252145767212,
+      "reward_std": 0.14558075368404388,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.30263715982437134,
+      "step": 1048
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 938.0,
+      "completions/max_terminated_length": 938.0,
+      "completions/mean_length": 457.27679443359375,
+      "completions/mean_terminated_length": 457.27679443359375,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 1.082538044880062,
+      "grad_norm": 0.8014538288116455,
+      "kl": 0.0947265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0049,
+      "num_tokens": 121256840.0,
+      "reward": 1.3500001430511475,
+      "reward_std": 0.1600000113248825,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34999996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.2933957576751709,
+      "step": 1049
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 890.0,
+      "completions/max_terminated_length": 890.0,
+      "completions/mean_length": 438.5625305175781,
+      "completions/mean_terminated_length": 438.5625305175781,
+      "completions/min_length": 257.0,
+      "completions/min_terminated_length": 257.0,
+      "epoch": 1.0835697704410627,
+      "grad_norm": 0.8739475607872009,
+      "kl": 0.1004638671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0138,
+      "num_tokens": 121374825.0,
+      "reward": 1.4937502145767212,
+      "reward_std": 0.18104833364486694,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4937499463558197,
+      "rewards/curriculum_aware_reward_fn/std": 0.27616459131240845,
+      "step": 1050
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 942.0,
+      "completions/max_terminated_length": 942.0,
+      "completions/mean_length": 457.0982360839844,
+      "completions/mean_terminated_length": 457.0982360839844,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 1.0846014960020633,
+      "grad_norm": 0.7015742659568787,
+      "kl": 0.101806640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0177,
+      "num_tokens": 121490084.0,
+      "reward": 1.4437501430511475,
+      "reward_std": 0.13059858977794647,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44374996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.31509506702423096,
+      "step": 1051
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1236.0,
+      "completions/max_terminated_length": 1236.0,
+      "completions/mean_length": 550.5535888671875,
+      "completions/mean_terminated_length": 550.5535888671875,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 1.0856332215630642,
+      "grad_norm": 0.6628881692886353,
+      "kl": 0.0919189453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0115,
+      "num_tokens": 121630804.0,
+      "reward": 1.262946605682373,
+      "reward_std": 0.11178959906101227,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2718749940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.31413981318473816,
+      "step": 1052
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1330.0,
+      "completions/max_terminated_length": 1330.0,
+      "completions/mean_length": 466.1160888671875,
+      "completions/mean_terminated_length": 466.1160888671875,
+      "completions/min_length": 231.0,
+      "completions/min_terminated_length": 231.0,
+      "epoch": 1.086664947124065,
+      "grad_norm": 0.8220956921577454,
+      "kl": 0.10546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0284,
+      "num_tokens": 121751539.0,
+      "reward": 1.2598215341567993,
+      "reward_std": 0.2078828513622284,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.26874998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.30440643429756165,
+      "step": 1053
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1585.0,
+      "completions/max_terminated_length": 1585.0,
+      "completions/mean_length": 473.20538330078125,
+      "completions/mean_terminated_length": 473.20538330078125,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "epoch": 1.0876966726850659,
+      "grad_norm": 0.8058013319969177,
+      "kl": 0.100830078125,
+      "learning_rate": 1e-06,
+      "loss": 0.016,
+      "num_tokens": 121871437.0,
+      "reward": 1.3035714626312256,
+      "reward_std": 0.18695953488349915,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3125,
+      "rewards/curriculum_aware_reward_fn/std": 0.28714969754219055,
+      "step": 1054
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 869.0,
+      "completions/max_terminated_length": 869.0,
+      "completions/mean_length": 423.8214416503906,
+      "completions/mean_terminated_length": 423.8214416503906,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "epoch": 1.0887283982460665,
+      "grad_norm": 0.7489458322525024,
+      "kl": 0.1041259765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0109,
+      "num_tokens": 121995744.0,
+      "reward": 1.4004465341567993,
+      "reward_std": 0.16618023812770844,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40937498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3004149794578552,
+      "step": 1055
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1536.0,
+      "completions/max_terminated_length": 1536.0,
+      "completions/mean_length": 549.3928833007812,
+      "completions/mean_terminated_length": 549.3928833007812,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 1.0897601238070673,
+      "grad_norm": 0.5715673565864563,
+      "kl": 0.090087890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0175,
+      "num_tokens": 122131665.0,
+      "reward": 1.2281250953674316,
+      "reward_std": 0.09870258718729019,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22812499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.27678829431533813,
+      "step": 1056
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 941.0,
+      "completions/max_terminated_length": 941.0,
+      "completions/mean_length": 477.794677734375,
+      "completions/mean_terminated_length": 477.794677734375,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 1.0907918493680682,
+      "grad_norm": 0.6655464172363281,
+      "kl": 0.0966796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0059,
+      "num_tokens": 122243201.0,
+      "reward": 1.2517858743667603,
+      "reward_std": 0.17265519499778748,
+      "rewards/code_format_reward/mean": 0.9642857313156128,
+      "rewards/code_format_reward/std": 0.18641091883182526,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28749996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.319381445646286,
+      "step": 1057
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1749.0,
+      "completions/max_terminated_length": 1749.0,
+      "completions/mean_length": 462.4375305175781,
+      "completions/mean_terminated_length": 462.4375305175781,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 1.0918235749290688,
+      "grad_norm": 0.7837157845497131,
+      "kl": 0.100341796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0064,
+      "num_tokens": 122365995.0,
+      "reward": 1.3571429252624512,
+      "reward_std": 0.21329350769519806,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.375,
+      "rewards/curriculum_aware_reward_fn/std": 0.3034338057041168,
+      "step": 1058
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1285.0,
+      "completions/max_terminated_length": 1285.0,
+      "completions/mean_length": 478.46429443359375,
+      "completions/mean_terminated_length": 478.46429443359375,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 1.0928553004900696,
+      "grad_norm": 0.721355140209198,
+      "kl": 0.10205078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0227,
+      "num_tokens": 122485243.0,
+      "reward": 1.3441966772079468,
+      "reward_std": 0.1735697239637375,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.31688809394836426,
+      "step": 1059
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1478.0,
+      "completions/max_terminated_length": 1478.0,
+      "completions/mean_length": 516.1785888671875,
+      "completions/mean_terminated_length": 516.1785888671875,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 1.0938870260510705,
+      "grad_norm": 0.7072334885597229,
+      "kl": 0.093505859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0312,
+      "num_tokens": 122614861.0,
+      "reward": 1.21875,
+      "reward_std": 0.15838447213172913,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21875,
+      "rewards/curriculum_aware_reward_fn/std": 0.27444660663604736,
+      "step": 1060
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1273.0,
+      "completions/max_terminated_length": 1273.0,
+      "completions/mean_length": 475.2232360839844,
+      "completions/mean_terminated_length": 475.2232360839844,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 1.094918751612071,
+      "grad_norm": 0.713330864906311,
+      "kl": 0.1046142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0283,
+      "num_tokens": 122733942.0,
+      "reward": 1.3187501430511475,
+      "reward_std": 0.11566664278507233,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31874996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.31005120277404785,
+      "step": 1061
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 960.0,
+      "completions/max_terminated_length": 960.0,
+      "completions/mean_length": 429.7589416503906,
+      "completions/mean_terminated_length": 429.7589416503906,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "epoch": 1.095950477173072,
+      "grad_norm": 0.749390721321106,
+      "kl": 0.1058349609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0053,
+      "num_tokens": 122845831.0,
+      "reward": 1.3468750715255737,
+      "reward_std": 0.15194597840309143,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.31688809394836426,
+      "step": 1062
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1335.0,
+      "completions/max_terminated_length": 1335.0,
+      "completions/mean_length": 467.26788330078125,
+      "completions/mean_terminated_length": 467.26788330078125,
+      "completions/min_length": 132.0,
+      "completions/min_terminated_length": 132.0,
+      "epoch": 1.0969822027340728,
+      "grad_norm": 0.8033453226089478,
+      "kl": 0.1021728515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0089,
+      "num_tokens": 122960836.0,
+      "reward": 1.3250001668930054,
+      "reward_std": 0.17109976708889008,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32499998807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.2997746765613556,
+      "step": 1063
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1742.0,
+      "completions/max_terminated_length": 1742.0,
+      "completions/mean_length": 470.5089416503906,
+      "completions/mean_terminated_length": 470.5089416503906,
+      "completions/min_length": 89.0,
+      "completions/min_terminated_length": 89.0,
+      "epoch": 1.0980139282950736,
+      "grad_norm": 0.6588408350944519,
+      "kl": 0.0997314453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0152,
+      "num_tokens": 123087166.0,
+      "reward": 1.2250001430511475,
+      "reward_std": 0.12828277051448822,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22499999403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.26097530126571655,
+      "step": 1064
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 909.0,
+      "completions/max_terminated_length": 909.0,
+      "completions/mean_length": 419.01788330078125,
+      "completions/mean_terminated_length": 419.01788330078125,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 1.0990456538560742,
+      "grad_norm": 0.5486484169960022,
+      "kl": 0.10107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0128,
+      "num_tokens": 123196354.0,
+      "reward": 1.2375000715255737,
+      "reward_std": 0.11234258115291595,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.23749998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3964652419090271,
+      "step": 1065
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 807.0,
+      "completions/max_terminated_length": 807.0,
+      "completions/mean_length": 418.40179443359375,
+      "completions/mean_terminated_length": 418.40179443359375,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 1.100077379417075,
+      "grad_norm": 0.8537197709083557,
+      "kl": 0.1011962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0098,
+      "num_tokens": 123307951.0,
+      "reward": 1.3441966772079468,
+      "reward_std": 0.17055167257785797,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3062620460987091,
+      "step": 1066
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 768.0,
+      "completions/max_terminated_length": 768.0,
+      "completions/mean_length": 426.20538330078125,
+      "completions/mean_terminated_length": 426.20538330078125,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "epoch": 1.101109104978076,
+      "grad_norm": 0.5966207981109619,
+      "kl": 0.09375,
+      "learning_rate": 1e-06,
+      "loss": 0.0033,
+      "num_tokens": 123415543.0,
+      "reward": 1.3531252145767212,
+      "reward_std": 0.10239209979772568,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3305251896381378,
+      "step": 1067
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 951.0,
+      "completions/max_terminated_length": 951.0,
+      "completions/mean_length": 473.4732360839844,
+      "completions/mean_terminated_length": 473.4732360839844,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 1.1021408305390765,
+      "grad_norm": 0.6321438550949097,
+      "kl": 0.095947265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0015,
+      "num_tokens": 123542415.0,
+      "reward": 1.2906250953674316,
+      "reward_std": 0.15328530967235565,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3004149794578552,
+      "step": 1068
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 970.0,
+      "completions/max_terminated_length": 970.0,
+      "completions/mean_length": 418.8035888671875,
+      "completions/mean_terminated_length": 418.8035888671875,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 1.1031725561000774,
+      "grad_norm": 0.7372820973396301,
+      "kl": 0.092529296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0201,
+      "num_tokens": 123659993.0,
+      "reward": 1.3781250715255737,
+      "reward_std": 0.18130964040756226,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.33266472816467285,
+      "step": 1069
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1124.0,
+      "completions/max_terminated_length": 1124.0,
+      "completions/mean_length": 421.2500305175781,
+      "completions/mean_terminated_length": 421.2500305175781,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 1.1042042816610782,
+      "grad_norm": 0.7741332054138184,
+      "kl": 0.1090087890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0223,
+      "num_tokens": 123777187.0,
+      "reward": 1.3035715818405151,
+      "reward_std": 0.1366921067237854,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3267020583152771,
+      "step": 1070
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 868.0,
+      "completions/max_terminated_length": 868.0,
+      "completions/mean_length": 491.6607360839844,
+      "completions/mean_terminated_length": 491.6607360839844,
+      "completions/min_length": 259.0,
+      "completions/min_terminated_length": 259.0,
+      "epoch": 1.105236007222079,
+      "grad_norm": 0.6807509064674377,
+      "kl": 0.095703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0131,
+      "num_tokens": 123906740.0,
+      "reward": 1.3093750476837158,
+      "reward_std": 0.20825700461864471,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.34442347288131714,
+      "step": 1071
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 921.0,
+      "completions/max_terminated_length": 921.0,
+      "completions/mean_length": 449.08038330078125,
+      "completions/mean_terminated_length": 449.08038330078125,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 1.1062677327830797,
+      "grad_norm": 0.7984329462051392,
+      "kl": 0.0904541015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0206,
+      "num_tokens": 124021262.0,
+      "reward": 1.2843750715255737,
+      "reward_std": 0.1465483158826828,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28437498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.29538729786872864,
+      "step": 1072
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 653.0,
+      "completions/max_terminated_length": 653.0,
+      "completions/mean_length": 408.8125305175781,
+      "completions/mean_terminated_length": 408.8125305175781,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 1.1072994583440805,
+      "grad_norm": 0.6433510184288025,
+      "kl": 0.0985107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0042,
+      "num_tokens": 124138218.0,
+      "reward": 1.3406251668930054,
+      "reward_std": 0.10084687173366547,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34062498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.33040592074394226,
+      "step": 1073
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 762.0,
+      "completions/max_terminated_length": 762.0,
+      "completions/mean_length": 449.169677734375,
+      "completions/mean_terminated_length": 449.169677734375,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 1.1083311839050813,
+      "grad_norm": 0.671330451965332,
+      "kl": 0.0992431640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0001,
+      "num_tokens": 124246324.0,
+      "reward": 1.2062500715255737,
+      "reward_std": 0.11124001443386078,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.20624998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.291711688041687,
+      "step": 1074
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 951.0,
+      "completions/max_terminated_length": 951.0,
+      "completions/mean_length": 467.982177734375,
+      "completions/mean_terminated_length": 467.982177734375,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 1.109362909466082,
+      "grad_norm": 0.731284499168396,
+      "kl": 0.091552734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0051,
+      "num_tokens": 124362501.0,
+      "reward": 1.3937503099441528,
+      "reward_std": 0.1432369202375412,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39374998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.29759734869003296,
+      "step": 1075
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1376.0,
+      "completions/max_terminated_length": 1376.0,
+      "completions/mean_length": 439.544677734375,
+      "completions/mean_terminated_length": 439.544677734375,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 1.1103946350270828,
+      "grad_norm": 0.7016761898994446,
+      "kl": 0.099365234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0083,
+      "num_tokens": 124479735.0,
+      "reward": 1.325446605682373,
+      "reward_std": 0.12540650367736816,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.31998246908187866,
+      "step": 1076
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1158.0,
+      "completions/mean_length": 470.46429443359375,
+      "completions/mean_terminated_length": 437.80181884765625,
+      "completions/min_length": 147.0,
+      "completions/min_terminated_length": 147.0,
+      "epoch": 1.1114263605880836,
+      "grad_norm": 0.8439191579818726,
+      "kl": 0.105224609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0468,
+      "num_tokens": 124603386.0,
+      "reward": 1.2410714626312256,
+      "reward_std": 0.19769737124443054,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25,
+      "rewards/curriculum_aware_reward_fn/std": 0.2950034439563751,
+      "step": 1077
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1081.0,
+      "completions/max_terminated_length": 1081.0,
+      "completions/mean_length": 436.6696472167969,
+      "completions/mean_terminated_length": 436.6696472167969,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 1.1124580861490843,
+      "grad_norm": 0.7611454129219055,
+      "kl": 0.1112060546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0013,
+      "num_tokens": 124723602.0,
+      "reward": 1.343750238418579,
+      "reward_std": 0.1612805426120758,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34375,
+      "rewards/curriculum_aware_reward_fn/std": 0.2857048511505127,
+      "step": 1078
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1553.0,
+      "completions/max_terminated_length": 1553.0,
+      "completions/mean_length": 460.9107360839844,
+      "completions/mean_terminated_length": 460.9107360839844,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 1.113489811710085,
+      "grad_norm": 0.7447132468223572,
+      "kl": 0.1043701171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0229,
+      "num_tokens": 124836908.0,
+      "reward": 1.3468750715255737,
+      "reward_std": 0.14876073598861694,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3371369242668152,
+      "step": 1079
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1044.0,
+      "completions/max_terminated_length": 1044.0,
+      "completions/mean_length": 474.58038330078125,
+      "completions/mean_terminated_length": 474.58038330078125,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 1.114521537271086,
+      "grad_norm": 0.6706196665763855,
+      "kl": 0.09521484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0172,
+      "num_tokens": 124957415.0,
+      "reward": 1.1812500953674316,
+      "reward_std": 0.12245100736618042,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.18124999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.23482069373130798,
+      "step": 1080
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1071.0,
+      "completions/max_terminated_length": 1071.0,
+      "completions/mean_length": 454.8482360839844,
+      "completions/mean_terminated_length": 454.8482360839844,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "epoch": 1.1155532628320866,
+      "grad_norm": 0.7702577114105225,
+      "kl": 0.10888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0025,
+      "num_tokens": 125075964.0,
+      "reward": 1.3062502145767212,
+      "reward_std": 0.14353716373443604,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30625003576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.29759737849235535,
+      "step": 1081
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1220.0,
+      "completions/max_terminated_length": 1220.0,
+      "completions/mean_length": 472.83929443359375,
+      "completions/mean_terminated_length": 472.83929443359375,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 1.1165849883930874,
+      "grad_norm": 0.7469330430030823,
+      "kl": 0.1016845703125,
+      "learning_rate": 1e-06,
+      "loss": 0.039,
+      "num_tokens": 125196297.0,
+      "reward": 1.2718751430511475,
+      "reward_std": 0.15673062205314636,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2718749940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.2960537075996399,
+      "step": 1082
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 810.0,
+      "completions/max_terminated_length": 810.0,
+      "completions/mean_length": 425.40179443359375,
+      "completions/mean_terminated_length": 425.40179443359375,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 1.1176167139540882,
+      "grad_norm": 0.7289387583732605,
+      "kl": 0.104248046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0135,
+      "num_tokens": 125308650.0,
+      "reward": 1.3781250715255737,
+      "reward_std": 0.16956450045108795,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3359658122062683,
+      "step": 1083
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1262.0,
+      "completions/max_terminated_length": 1262.0,
+      "completions/mean_length": 470.9285888671875,
+      "completions/mean_terminated_length": 470.9285888671875,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "epoch": 1.118648439515089,
+      "grad_norm": 0.7212175130844116,
+      "kl": 0.1165771484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0327,
+      "num_tokens": 125434313.0,
+      "reward": 1.2691967487335205,
+      "reward_std": 0.16892673075199127,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.27812498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.32255885004997253,
+      "step": 1084
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1064.0,
+      "completions/max_terminated_length": 1064.0,
+      "completions/mean_length": 489.5089416503906,
+      "completions/mean_terminated_length": 489.5089416503906,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 1.1196801650760897,
+      "grad_norm": 0.7285465598106384,
+      "kl": 0.095458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0382,
+      "num_tokens": 125545943.0,
+      "reward": 1.2250001430511475,
+      "reward_std": 0.15566104650497437,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22500000894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.27737507224082947,
+      "step": 1085
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2961.0,
+      "completions/max_terminated_length": 2961.0,
+      "completions/mean_length": 472.9285888671875,
+      "completions/mean_terminated_length": 472.9285888671875,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 1.1207118906370905,
+      "grad_norm": 0.8245320320129395,
+      "kl": 0.110595703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0221,
+      "num_tokens": 125676408.0,
+      "reward": 1.2004464864730835,
+      "reward_std": 0.23227369785308838,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.20937499403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.28754404187202454,
+      "step": 1086
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1224.0,
+      "completions/max_terminated_length": 1224.0,
+      "completions/mean_length": 438.2410888671875,
+      "completions/mean_terminated_length": 438.2410888671875,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 1.1217436161980914,
+      "grad_norm": 0.5570046305656433,
+      "kl": 0.1021728515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0165,
+      "num_tokens": 125790745.0,
+      "reward": 1.3468750715255737,
+      "reward_std": 0.0733615979552269,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.4082086384296417,
+      "step": 1087
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1317.0,
+      "completions/max_terminated_length": 1317.0,
+      "completions/mean_length": 488.6250305175781,
+      "completions/mean_terminated_length": 488.6250305175781,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 1.122775341759092,
+      "grad_norm": 0.7493662238121033,
+      "kl": 0.102294921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0164,
+      "num_tokens": 125925019.0,
+      "reward": 1.28125,
+      "reward_std": 0.14943061769008636,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.32153382897377014,
+      "step": 1088
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 790.0,
+      "completions/max_terminated_length": 790.0,
+      "completions/mean_length": 466.40179443359375,
+      "completions/mean_terminated_length": 466.40179443359375,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 1.1238070673200928,
+      "grad_norm": 0.8465061783790588,
+      "kl": 0.1046142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0194,
+      "num_tokens": 126049384.0,
+      "reward": 1.359375238418579,
+      "reward_std": 0.2215120792388916,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.359375,
+      "rewards/curriculum_aware_reward_fn/std": 0.316763699054718,
+      "step": 1089
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1055.0,
+      "completions/max_terminated_length": 1055.0,
+      "completions/mean_length": 463.5357360839844,
+      "completions/mean_terminated_length": 463.5357360839844,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 1.1248387928810937,
+      "grad_norm": 0.6302226781845093,
+      "kl": 0.0997314453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0197,
+      "num_tokens": 126165609.0,
+      "reward": 1.2973216772079468,
+      "reward_std": 0.11076794564723969,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3120785355567932,
+      "step": 1090
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 673.0,
+      "completions/max_terminated_length": 673.0,
+      "completions/mean_length": 399.1875305175781,
+      "completions/mean_terminated_length": 399.1875305175781,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 1.1258705184420945,
+      "grad_norm": 0.7528978586196899,
+      "kl": 0.11376953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0048,
+      "num_tokens": 126273871.0,
+      "reward": 1.3937500715255737,
+      "reward_std": 0.12978744506835938,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39374998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.32251301407814026,
+      "step": 1091
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 976.0,
+      "completions/max_terminated_length": 976.0,
+      "completions/mean_length": 437.52679443359375,
+      "completions/mean_terminated_length": 437.52679443359375,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 1.1269022440030951,
+      "grad_norm": 0.81932133436203,
+      "kl": 0.1043701171875,
+      "learning_rate": 1e-06,
+      "loss": 0.008,
+      "num_tokens": 126389619.0,
+      "reward": 1.3781250715255737,
+      "reward_std": 0.15330146253108978,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.29391586780548096,
+      "step": 1092
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 879.0,
+      "completions/max_terminated_length": 879.0,
+      "completions/mean_length": 424.6875305175781,
+      "completions/mean_terminated_length": 424.6875305175781,
+      "completions/min_length": 161.0,
+      "completions/min_terminated_length": 161.0,
+      "epoch": 1.127933969564096,
+      "grad_norm": 0.7560570240020752,
+      "kl": 0.108154296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0196,
+      "num_tokens": 126497923.0,
+      "reward": 1.3000000715255737,
+      "reward_std": 0.1389334499835968,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29999998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.33173036575317383,
+      "step": 1093
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1163.0,
+      "completions/max_terminated_length": 1163.0,
+      "completions/mean_length": 490.2500305175781,
+      "completions/mean_terminated_length": 490.2500305175781,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 1.1289656951250968,
+      "grad_norm": 0.5553421378135681,
+      "kl": 0.0921630859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0089,
+      "num_tokens": 126621102.0,
+      "reward": 1.2156251668930054,
+      "reward_std": 0.1150357574224472,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21562500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.29431790113449097,
+      "step": 1094
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 850.0,
+      "completions/max_terminated_length": 850.0,
+      "completions/mean_length": 465.0000305175781,
+      "completions/mean_terminated_length": 465.0000305175781,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 1.1299974206860974,
+      "grad_norm": 0.5771997570991516,
+      "kl": 0.104248046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0051,
+      "num_tokens": 126745716.0,
+      "reward": 1.3218752145767212,
+      "reward_std": 0.10599697381258011,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3218750059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.34567996859550476,
+      "step": 1095
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 751.0,
+      "completions/max_terminated_length": 751.0,
+      "completions/mean_length": 448.8839416503906,
+      "completions/mean_terminated_length": 448.8839416503906,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 1.1310291462470983,
+      "grad_norm": 0.6750321388244629,
+      "kl": 0.112548828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0115,
+      "num_tokens": 126864234.0,
+      "reward": 1.3625000715255737,
+      "reward_std": 0.12350915372371674,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3005625307559967,
+      "step": 1096
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 768.0,
+      "completions/max_terminated_length": 768.0,
+      "completions/mean_length": 448.0982360839844,
+      "completions/mean_terminated_length": 448.0982360839844,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 1.1320608718080991,
+      "grad_norm": 0.7752856016159058,
+      "kl": 0.105712890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0235,
+      "num_tokens": 126983042.0,
+      "reward": 1.2531250715255737,
+      "reward_std": 0.15456053614616394,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25312498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.30882522463798523,
+      "step": 1097
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1640.0,
+      "completions/max_terminated_length": 1640.0,
+      "completions/mean_length": 518.357177734375,
+      "completions/mean_terminated_length": 518.357177734375,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 1.1330925973690997,
+      "grad_norm": 0.6677262187004089,
+      "kl": 0.0966796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0171,
+      "num_tokens": 127120947.0,
+      "reward": 1.2125000953674316,
+      "reward_std": 0.16567465662956238,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21249999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.29845699667930603,
+      "step": 1098
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 746.0,
+      "completions/max_terminated_length": 746.0,
+      "completions/mean_length": 428.7232360839844,
+      "completions/mean_terminated_length": 428.7232360839844,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "epoch": 1.1341243229301006,
+      "grad_norm": 0.8252878785133362,
+      "kl": 0.1007080078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0126,
+      "num_tokens": 127239447.0,
+      "reward": 1.2843750715255737,
+      "reward_std": 0.15611320734024048,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28437498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.2991000711917877,
+      "step": 1099
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 831.0,
+      "completions/max_terminated_length": 831.0,
+      "completions/mean_length": 451.3035888671875,
+      "completions/mean_terminated_length": 451.3035888671875,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 1.1351560484911014,
+      "grad_norm": 0.7429160475730896,
+      "kl": 0.1077880859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0243,
+      "num_tokens": 127356577.0,
+      "reward": 1.2875001430511475,
+      "reward_std": 0.1438807249069214,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2874999940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.3159071207046509,
+      "step": 1100
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 724.0,
+      "completions/max_terminated_length": 724.0,
+      "completions/mean_length": 416.5535888671875,
+      "completions/mean_terminated_length": 416.5535888671875,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 1.136187774052102,
+      "grad_norm": 1.1605981588363647,
+      "kl": 0.179931640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0014,
+      "num_tokens": 127472601.0,
+      "reward": 1.3343751430511475,
+      "reward_std": 0.12120731920003891,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.32680758833885193,
+      "step": 1101
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 774.0,
+      "completions/max_terminated_length": 774.0,
+      "completions/mean_length": 434.2589416503906,
+      "completions/mean_terminated_length": 434.2589416503906,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 1.1372194996131029,
+      "grad_norm": 0.8007761836051941,
+      "kl": 0.1007080078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0088,
+      "num_tokens": 127584574.0,
+      "reward": 1.2375000715255737,
+      "reward_std": 0.12666957080364227,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.23749998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.28272324800491333,
+      "step": 1102
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1192.0,
+      "completions/max_terminated_length": 1192.0,
+      "completions/mean_length": 466.794677734375,
+      "completions/mean_terminated_length": 466.794677734375,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 1.1382512251741037,
+      "grad_norm": 0.746719241142273,
+      "kl": 0.109619140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0054,
+      "num_tokens": 127712301.0,
+      "reward": 1.3035715818405151,
+      "reward_std": 0.1359844207763672,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3125,
+      "rewards/curriculum_aware_reward_fn/std": 0.31987470388412476,
+      "step": 1103
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1111.0,
+      "completions/max_terminated_length": 1111.0,
+      "completions/mean_length": 440.1339416503906,
+      "completions/mean_terminated_length": 440.1339416503906,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "epoch": 1.1392829507351045,
+      "grad_norm": 0.7280793786048889,
+      "kl": 0.105712890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0081,
+      "num_tokens": 127819759.0,
+      "reward": 1.3468750715255737,
+      "reward_std": 0.1580439656972885,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3098445534706116,
+      "step": 1104
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1057.0,
+      "completions/max_terminated_length": 1057.0,
+      "completions/mean_length": 476.0535888671875,
+      "completions/mean_terminated_length": 476.0535888671875,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 1.1403146762961052,
+      "grad_norm": 0.7410470247268677,
+      "kl": 0.0887451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0098,
+      "num_tokens": 127929350.0,
+      "reward": 1.2937501668930054,
+      "reward_std": 0.15814347565174103,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.32397621870040894,
+      "step": 1105
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1237.0,
+      "completions/max_terminated_length": 1237.0,
+      "completions/mean_length": 480.1785888671875,
+      "completions/mean_terminated_length": 480.1785888671875,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 1.141346401857106,
+      "grad_norm": 0.7002902626991272,
+      "kl": 0.0968017578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0058,
+      "num_tokens": 128052206.0,
+      "reward": 1.262500286102295,
+      "reward_std": 0.15251708030700684,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.26250001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.29527053236961365,
+      "step": 1106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1026.0,
+      "completions/max_terminated_length": 1026.0,
+      "completions/mean_length": 426.76788330078125,
+      "completions/mean_terminated_length": 426.76788330078125,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 1.1423781274181068,
+      "grad_norm": 0.8631582260131836,
+      "kl": 0.0955810546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0142,
+      "num_tokens": 128162507.0,
+      "reward": 1.4031251668930054,
+      "reward_std": 0.19521509110927582,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40312498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.29791173338890076,
+      "step": 1107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1116.0,
+      "completions/max_terminated_length": 1116.0,
+      "completions/mean_length": 456.669677734375,
+      "completions/mean_terminated_length": 456.669677734375,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "epoch": 1.1434098529791075,
+      "grad_norm": 0.6821380853652954,
+      "kl": 0.0904541015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0024,
+      "num_tokens": 128281260.0,
+      "reward": 1.3531252145767212,
+      "reward_std": 0.1475609540939331,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3271692097187042,
+      "step": 1108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 800.0,
+      "completions/max_terminated_length": 800.0,
+      "completions/mean_length": 420.45538330078125,
+      "completions/mean_terminated_length": 420.45538330078125,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 1.1444415785401083,
+      "grad_norm": 0.7330096960067749,
+      "kl": 0.10400390625,
+      "learning_rate": 1e-06,
+      "loss": -0.008,
+      "num_tokens": 128395376.0,
+      "reward": 1.4156252145767212,
+      "reward_std": 0.16615553200244904,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4156249463558197,
+      "rewards/curriculum_aware_reward_fn/std": 0.3204748034477234,
+      "step": 1109
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 961.0,
+      "completions/max_terminated_length": 961.0,
+      "completions/mean_length": 438.5446472167969,
+      "completions/mean_terminated_length": 438.5446472167969,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 1.1454733041011091,
+      "grad_norm": 0.6928600668907166,
+      "kl": 0.096923828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0056,
+      "num_tokens": 128512547.0,
+      "reward": 1.3531252145767212,
+      "reward_std": 0.11022058129310608,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3062620460987091,
+      "step": 1110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 862.0,
+      "completions/max_terminated_length": 862.0,
+      "completions/mean_length": 437.52679443359375,
+      "completions/mean_terminated_length": 437.52679443359375,
+      "completions/min_length": 129.0,
+      "completions/min_terminated_length": 129.0,
+      "epoch": 1.14650502966211,
+      "grad_norm": 0.6906600594520569,
+      "kl": 0.09228515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0042,
+      "num_tokens": 128622716.0,
+      "reward": 1.3468750715255737,
+      "reward_std": 0.10169928520917892,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.29896828532218933,
+      "step": 1111
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1141.0,
+      "completions/max_terminated_length": 1141.0,
+      "completions/mean_length": 480.357177734375,
+      "completions/mean_terminated_length": 480.357177734375,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 1.1475367552231106,
+      "grad_norm": 0.8089150190353394,
+      "kl": 0.10302734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0018,
+      "num_tokens": 128747160.0,
+      "reward": 1.2281250953674316,
+      "reward_std": 0.16455334424972534,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22812499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.27277201414108276,
+      "step": 1112
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1058.0,
+      "completions/max_terminated_length": 1058.0,
+      "completions/mean_length": 465.46429443359375,
+      "completions/mean_terminated_length": 465.46429443359375,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 1.1485684807841114,
+      "grad_norm": 0.7459560632705688,
+      "kl": 0.09130859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0069,
+      "num_tokens": 128868095.0,
+      "reward": 1.3218750953674316,
+      "reward_std": 0.15609592199325562,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3218749463558197,
+      "rewards/curriculum_aware_reward_fn/std": 0.32596227526664734,
+      "step": 1113
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 939.0,
+      "completions/max_terminated_length": 939.0,
+      "completions/mean_length": 489.3035888671875,
+      "completions/mean_terminated_length": 489.3035888671875,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 1.1496002063451123,
+      "grad_norm": 0.7976208329200745,
+      "kl": 0.09814453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0061,
+      "num_tokens": 128986842.0,
+      "reward": 1.2781251668930054,
+      "reward_std": 0.1544256955385208,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.27812498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.2663382887840271,
+      "step": 1114
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1825.0,
+      "completions/max_terminated_length": 1825.0,
+      "completions/mean_length": 467.4285888671875,
+      "completions/mean_terminated_length": 467.4285888671875,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 1.150631931906113,
+      "grad_norm": 0.6780887246131897,
+      "kl": 0.0997314453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0164,
+      "num_tokens": 129100472.0,
+      "reward": 1.296875,
+      "reward_std": 0.13180279731750488,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.296875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3193660080432892,
+      "step": 1115
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 799.0,
+      "completions/max_terminated_length": 799.0,
+      "completions/mean_length": 450.52679443359375,
+      "completions/mean_terminated_length": 450.52679443359375,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 1.1516636574671137,
+      "grad_norm": 0.8171302676200867,
+      "kl": 0.098876953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0324,
+      "num_tokens": 129214050.0,
+      "reward": 1.3812501430511475,
+      "reward_std": 0.19573992490768433,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38124996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.3273649215698242,
+      "step": 1116
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 787.0,
+      "completions/max_terminated_length": 787.0,
+      "completions/mean_length": 428.8482360839844,
+      "completions/mean_terminated_length": 428.8482360839844,
+      "completions/min_length": 151.0,
+      "completions/min_terminated_length": 151.0,
+      "epoch": 1.1526953830281146,
+      "grad_norm": 0.8667630553245544,
+      "kl": 0.1043701171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0108,
+      "num_tokens": 129325102.0,
+      "reward": 1.3531250953674316,
+      "reward_std": 0.1513831466436386,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.29525381326675415,
+      "step": 1117
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 771.0,
+      "completions/max_terminated_length": 771.0,
+      "completions/mean_length": 421.7500305175781,
+      "completions/mean_terminated_length": 421.7500305175781,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 1.1537271085891152,
+      "grad_norm": 1.1657435894012451,
+      "kl": 0.181884765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0083,
+      "num_tokens": 129433000.0,
+      "reward": 1.1968752145767212,
+      "reward_std": 0.11701443046331406,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.2648543119430542,
+      "step": 1118
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 704.0,
+      "completions/max_terminated_length": 704.0,
+      "completions/mean_length": 428.83929443359375,
+      "completions/mean_terminated_length": 428.83929443359375,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 1.154758834150116,
+      "grad_norm": 0.6912339329719543,
+      "kl": 0.1046142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0011,
+      "num_tokens": 129538017.0,
+      "reward": 1.303125023841858,
+      "reward_std": 0.1794067919254303,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30312496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.3305251896381378,
+      "step": 1119
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1067.0,
+      "completions/max_terminated_length": 1067.0,
+      "completions/mean_length": 479.6160888671875,
+      "completions/mean_terminated_length": 479.6160888671875,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 1.1557905597111169,
+      "grad_norm": 0.639657735824585,
+      "kl": 0.095458984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0019,
+      "num_tokens": 129657219.0,
+      "reward": 1.265625,
+      "reward_std": 0.08700991421937943,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.265625,
+      "rewards/curriculum_aware_reward_fn/std": 0.30895283818244934,
+      "step": 1120
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 754.0,
+      "completions/max_terminated_length": 754.0,
+      "completions/mean_length": 432.3750305175781,
+      "completions/mean_terminated_length": 432.3750305175781,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 1.1568222852721175,
+      "grad_norm": 0.8700076937675476,
+      "kl": 0.10205078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0075,
+      "num_tokens": 129760644.0,
+      "reward": 1.372321605682373,
+      "reward_std": 0.23868943750858307,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3812499940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.3028486967086792,
+      "step": 1121
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1218.0,
+      "completions/max_terminated_length": 1218.0,
+      "completions/mean_length": 449.9732360839844,
+      "completions/mean_terminated_length": 449.9732360839844,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 1.1578540108331183,
+      "grad_norm": 0.634154200553894,
+      "kl": 0.091552734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0038,
+      "num_tokens": 129867776.0,
+      "reward": 1.2879465818405151,
+      "reward_std": 0.14574995636940002,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.296875,
+      "rewards/curriculum_aware_reward_fn/std": 0.32956981658935547,
+      "step": 1122
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 855.0,
+      "completions/max_terminated_length": 855.0,
+      "completions/mean_length": 455.40179443359375,
+      "completions/mean_terminated_length": 455.40179443359375,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 1.1588857363941192,
+      "grad_norm": 0.7961022853851318,
+      "kl": 0.1005859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0023,
+      "num_tokens": 129992561.0,
+      "reward": 1.278125286102295,
+      "reward_std": 0.17692962288856506,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.27812498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.3703407943248749,
+      "step": 1123
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 907.0,
+      "completions/max_terminated_length": 907.0,
+      "completions/mean_length": 455.89288330078125,
+      "completions/mean_terminated_length": 455.89288330078125,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 1.15991746195512,
+      "grad_norm": 0.5796977877616882,
+      "kl": 0.0980224609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0423,
+      "num_tokens": 130104493.0,
+      "reward": 1.453125238418579,
+      "reward_std": 0.09935219585895538,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.453125,
+      "rewards/curriculum_aware_reward_fn/std": 0.30315765738487244,
+      "step": 1124
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 790.0,
+      "completions/max_terminated_length": 790.0,
+      "completions/mean_length": 468.4732360839844,
+      "completions/mean_terminated_length": 468.4732360839844,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 1.1609491875161206,
+      "grad_norm": 0.7204039096832275,
+      "kl": 0.08984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0093,
+      "num_tokens": 130221229.0,
+      "reward": 1.2281252145767212,
+      "reward_std": 0.12669439613819122,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22812499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.2807471752166748,
+      "step": 1125
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 846.0,
+      "completions/max_terminated_length": 846.0,
+      "completions/mean_length": 444.0000305175781,
+      "completions/mean_terminated_length": 444.0000305175781,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 1.1619809130771215,
+      "grad_norm": 0.6680362224578857,
+      "kl": 0.0992431640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0076,
+      "num_tokens": 130342861.0,
+      "reward": 1.3093751668930054,
+      "reward_std": 0.10159022361040115,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30937498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.38382449746131897,
+      "step": 1126
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1049.0,
+      "completions/max_terminated_length": 1049.0,
+      "completions/mean_length": 510.0982360839844,
+      "completions/mean_terminated_length": 510.0982360839844,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 1.1630126386381223,
+      "grad_norm": 0.5835331082344055,
+      "kl": 0.0870361328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0252,
+      "num_tokens": 130476047.0,
+      "reward": 1.3125,
+      "reward_std": 0.1379421055316925,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3233064115047455,
+      "step": 1127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 832.0,
+      "completions/max_terminated_length": 832.0,
+      "completions/mean_length": 427.7232360839844,
+      "completions/mean_terminated_length": 427.7232360839844,
+      "completions/min_length": 161.0,
+      "completions/min_terminated_length": 161.0,
+      "epoch": 1.164044364199123,
+      "grad_norm": 0.724087655544281,
+      "kl": 0.0877685546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0036,
+      "num_tokens": 130585853.0,
+      "reward": 1.3406251668930054,
+      "reward_std": 0.11424807459115982,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34062498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.3025068938732147,
+      "step": 1128
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1391.0,
+      "completions/max_terminated_length": 1391.0,
+      "completions/mean_length": 467.46429443359375,
+      "completions/mean_terminated_length": 467.46429443359375,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 1.1650760897601238,
+      "grad_norm": 0.6006419658660889,
+      "kl": 0.0948486328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0135,
+      "num_tokens": 130704058.0,
+      "reward": 1.3656251430511475,
+      "reward_std": 0.11053341627120972,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36562496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.3165147304534912,
+      "step": 1129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 903.0,
+      "completions/max_terminated_length": 903.0,
+      "completions/mean_length": 446.83929443359375,
+      "completions/mean_terminated_length": 446.83929443359375,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 1.1661078153211246,
+      "grad_norm": 0.7218096256256104,
+      "kl": 0.08984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0193,
+      "num_tokens": 130818147.0,
+      "reward": 1.265625238418579,
+      "reward_std": 0.143123596906662,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.265625,
+      "rewards/curriculum_aware_reward_fn/std": 0.2867204546928406,
+      "step": 1130
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 875.0,
+      "completions/max_terminated_length": 875.0,
+      "completions/mean_length": 389.6250305175781,
+      "completions/mean_terminated_length": 389.6250305175781,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "epoch": 1.1671395408821255,
+      "grad_norm": 0.5740315318107605,
+      "kl": 0.1015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0139,
+      "num_tokens": 130919514.0,
+      "reward": 1.4406250715255737,
+      "reward_std": 0.12512782216072083,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44062498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3280114233493805,
+      "step": 1131
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 909.0,
+      "completions/max_terminated_length": 909.0,
+      "completions/mean_length": 477.8035888671875,
+      "completions/mean_terminated_length": 477.8035888671875,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 1.168171266443126,
+      "grad_norm": 0.686428964138031,
+      "kl": 0.08837890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0018,
+      "num_tokens": 131040466.0,
+      "reward": 1.3093751668930054,
+      "reward_std": 0.1683778315782547,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3570103049278259,
+      "step": 1132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 898.0,
+      "completions/max_terminated_length": 898.0,
+      "completions/mean_length": 465.3125305175781,
+      "completions/mean_terminated_length": 465.3125305175781,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 1.169202992004127,
+      "grad_norm": 0.6112284064292908,
+      "kl": 0.089599609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0064,
+      "num_tokens": 131163757.0,
+      "reward": 1.3500001430511475,
+      "reward_std": 0.1343052089214325,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3499999940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.3080745041370392,
+      "step": 1133
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1139.0,
+      "completions/max_terminated_length": 1139.0,
+      "completions/mean_length": 451.64288330078125,
+      "completions/mean_terminated_length": 451.64288330078125,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 1.1702347175651278,
+      "grad_norm": 0.605445384979248,
+      "kl": 0.0877685546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0084,
+      "num_tokens": 131272287.0,
+      "reward": 1.3031251430511475,
+      "reward_std": 0.12129982560873032,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30312496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.31338611245155334,
+      "step": 1134
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 979.0,
+      "completions/max_terminated_length": 979.0,
+      "completions/mean_length": 442.544677734375,
+      "completions/mean_terminated_length": 442.544677734375,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 1.1712664431261284,
+      "grad_norm": 0.6315911412239075,
+      "kl": 0.0882568359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0021,
+      "num_tokens": 131384250.0,
+      "reward": 1.2875001430511475,
+      "reward_std": 0.15498583018779755,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2874999940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.3295847773551941,
+      "step": 1135
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1045.0,
+      "completions/max_terminated_length": 1045.0,
+      "completions/mean_length": 429.51788330078125,
+      "completions/mean_terminated_length": 429.51788330078125,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 1.1722981686871292,
+      "grad_norm": 0.6539668440818787,
+      "kl": 0.0860595703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0019,
+      "num_tokens": 131501355.0,
+      "reward": 1.3468750715255737,
+      "reward_std": 0.16448518633842468,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3237784504890442,
+      "step": 1136
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 751.0,
+      "completions/max_terminated_length": 751.0,
+      "completions/mean_length": 424.40179443359375,
+      "completions/mean_terminated_length": 424.40179443359375,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 1.17332989424813,
+      "grad_norm": 0.7477120757102966,
+      "kl": 0.0941162109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0238,
+      "num_tokens": 131618440.0,
+      "reward": 1.3593751192092896,
+      "reward_std": 0.13173705339431763,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.359375,
+      "rewards/curriculum_aware_reward_fn/std": 0.32704871892929077,
+      "step": 1137
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 935.0,
+      "completions/max_terminated_length": 935.0,
+      "completions/mean_length": 504.96429443359375,
+      "completions/mean_terminated_length": 504.96429443359375,
+      "completions/min_length": 286.0,
+      "completions/min_terminated_length": 286.0,
+      "epoch": 1.1743616198091307,
+      "grad_norm": 0.670097827911377,
+      "kl": 0.08349609375,
+      "learning_rate": 1e-06,
+      "loss": 0.001,
+      "num_tokens": 131740838.0,
+      "reward": 1.3191965818405151,
+      "reward_std": 0.15710267424583435,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.328125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3090803623199463,
+      "step": 1138
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1451.0,
+      "completions/max_terminated_length": 1451.0,
+      "completions/mean_length": 507.232177734375,
+      "completions/mean_terminated_length": 507.232177734375,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 1.1753933453701315,
+      "grad_norm": 0.5765451192855835,
+      "kl": 0.0892333984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0065,
+      "num_tokens": 131864582.0,
+      "reward": 1.3218752145767212,
+      "reward_std": 0.08672605454921722,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3218750059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.40724197030067444,
+      "step": 1139
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 812.0,
+      "completions/max_terminated_length": 812.0,
+      "completions/mean_length": 449.5089416503906,
+      "completions/mean_terminated_length": 449.5089416503906,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 1.1764250709311324,
+      "grad_norm": 0.6205755472183228,
+      "kl": 0.0936279296875,
+      "learning_rate": 1e-06,
+      "loss": 0.012,
+      "num_tokens": 131986236.0,
+      "reward": 1.403125286102295,
+      "reward_std": 0.13763344287872314,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40312498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.29040831327438354,
+      "step": 1140
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 767.0,
+      "completions/max_terminated_length": 767.0,
+      "completions/mean_length": 464.6160888671875,
+      "completions/mean_terminated_length": 464.6160888671875,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 1.177456796492133,
+      "grad_norm": 0.794878363609314,
+      "kl": 0.0904541015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0054,
+      "num_tokens": 132106362.0,
+      "reward": 1.3500001430511475,
+      "reward_std": 0.1489112824201584,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3499999940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.30082470178604126,
+      "step": 1141
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1085.0,
+      "completions/max_terminated_length": 1085.0,
+      "completions/mean_length": 484.5625305175781,
+      "completions/mean_terminated_length": 484.5625305175781,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 1.1784885220531338,
+      "grad_norm": 0.6890538334846497,
+      "kl": 0.0811767578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0144,
+      "num_tokens": 132228820.0,
+      "reward": 1.3093751668930054,
+      "reward_std": 0.2188614159822464,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30937498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.31426528096199036,
+      "step": 1142
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 957.0,
+      "completions/max_terminated_length": 957.0,
+      "completions/mean_length": 487.107177734375,
+      "completions/mean_terminated_length": 487.107177734375,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 1.1795202476141347,
+      "grad_norm": 0.6225954294204712,
+      "kl": 0.08984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0153,
+      "num_tokens": 132355595.0,
+      "reward": 1.2562501430511475,
+      "reward_std": 0.14356856048107147,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25624996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.31157293915748596,
+      "step": 1143
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 924.0,
+      "completions/max_terminated_length": 924.0,
+      "completions/mean_length": 508.8035888671875,
+      "completions/mean_terminated_length": 508.8035888671875,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 1.1805519731751355,
+      "grad_norm": 0.6063207983970642,
+      "kl": 0.081298828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0109,
+      "num_tokens": 132481216.0,
+      "reward": 1.3156250715255737,
+      "reward_std": 0.16792182624340057,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.30067723989486694,
+      "step": 1144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 901.0,
+      "completions/max_terminated_length": 901.0,
+      "completions/mean_length": 506.58038330078125,
+      "completions/mean_terminated_length": 506.58038330078125,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "epoch": 1.1815836987361361,
+      "grad_norm": 0.7089967727661133,
+      "kl": 0.086181640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0022,
+      "num_tokens": 132613077.0,
+      "reward": 1.2750000953674316,
+      "reward_std": 0.14475411176681519,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.2950034439563751,
+      "step": 1145
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 866.0,
+      "completions/max_terminated_length": 866.0,
+      "completions/mean_length": 506.5625305175781,
+      "completions/mean_terminated_length": 506.5625305175781,
+      "completions/min_length": 299.0,
+      "completions/min_terminated_length": 299.0,
+      "epoch": 1.182615424297137,
+      "grad_norm": 0.7573201060295105,
+      "kl": 0.088134765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0024,
+      "num_tokens": 132742346.0,
+      "reward": 1.2906250953674316,
+      "reward_std": 0.20178672671318054,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.29297566413879395,
+      "step": 1146
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 747.0,
+      "completions/max_terminated_length": 747.0,
+      "completions/mean_length": 440.8660888671875,
+      "completions/mean_terminated_length": 440.8660888671875,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 1.1836471498581378,
+      "grad_norm": 0.7480193972587585,
+      "kl": 0.093017578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0139,
+      "num_tokens": 132852147.0,
+      "reward": 1.3343751430511475,
+      "reward_std": 0.20899318158626556,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.32680758833885193,
+      "step": 1147
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1013.0,
+      "completions/max_terminated_length": 1013.0,
+      "completions/mean_length": 496.732177734375,
+      "completions/mean_terminated_length": 496.732177734375,
+      "completions/min_length": 276.0,
+      "completions/min_terminated_length": 276.0,
+      "epoch": 1.1846788754191384,
+      "grad_norm": 0.6407890915870667,
+      "kl": 0.0936279296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0205,
+      "num_tokens": 132985838.0,
+      "reward": 1.2375000715255737,
+      "reward_std": 0.17742672562599182,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.23749999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.27879244089126587,
+      "step": 1148
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 999.0,
+      "completions/max_terminated_length": 999.0,
+      "completions/mean_length": 507.39288330078125,
+      "completions/mean_terminated_length": 507.39288330078125,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "epoch": 1.1857106009801393,
+      "grad_norm": 0.5715434551239014,
+      "kl": 0.0921630859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0081,
+      "num_tokens": 133107850.0,
+      "reward": 1.2723214626312256,
+      "reward_std": 0.1283775120973587,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2812499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.2965359389781952,
+      "step": 1149
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 863.0,
+      "completions/max_terminated_length": 863.0,
+      "completions/mean_length": 489.51788330078125,
+      "completions/mean_terminated_length": 489.51788330078125,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "epoch": 1.18674232654114,
+      "grad_norm": 0.674042284488678,
+      "kl": 0.1044921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0028,
+      "num_tokens": 133230660.0,
+      "reward": 1.2312501668930054,
+      "reward_std": 0.18054217100143433,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.23124997317790985,
+      "rewards/curriculum_aware_reward_fn/std": 0.31005120277404785,
+      "step": 1150
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 963.0,
+      "completions/max_terminated_length": 963.0,
+      "completions/mean_length": 504.2857360839844,
+      "completions/mean_terminated_length": 504.2857360839844,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "epoch": 1.187774052102141,
+      "grad_norm": 0.6435889601707458,
+      "kl": 0.1002197265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0073,
+      "num_tokens": 133355964.0,
+      "reward": 1.2593750953674316,
+      "reward_std": 0.16222849488258362,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2593750059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.28863856196403503,
+      "step": 1151
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 869.0,
+      "completions/max_terminated_length": 869.0,
+      "completions/mean_length": 517.8035888671875,
+      "completions/mean_terminated_length": 517.8035888671875,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 1.1888057776631415,
+      "grad_norm": 0.630872905254364,
+      "kl": 0.093017578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0268,
+      "num_tokens": 133491259.0,
+      "reward": 1.2750002145767212,
+      "reward_std": 0.16553469002246857,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.29872098565101624,
+      "step": 1152
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 931.0,
+      "completions/max_terminated_length": 931.0,
+      "completions/mean_length": 518.6785888671875,
+      "completions/mean_terminated_length": 518.6785888671875,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "epoch": 1.1898375032241424,
+      "grad_norm": 0.7435547709465027,
+      "kl": 0.08837890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0086,
+      "num_tokens": 133623901.0,
+      "reward": 1.28125,
+      "reward_std": 0.20633579790592194,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3038880527019501,
+      "step": 1153
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1219.0,
+      "completions/max_terminated_length": 1219.0,
+      "completions/mean_length": 507.5357360839844,
+      "completions/mean_terminated_length": 507.5357360839844,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 1.1908692287851432,
+      "grad_norm": 0.5446999073028564,
+      "kl": 0.0982666015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0032,
+      "num_tokens": 133742863.0,
+      "reward": 1.3687502145767212,
+      "reward_std": 0.10711290687322617,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3687500059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3249480128288269,
+      "step": 1154
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 762.0,
+      "completions/max_terminated_length": 762.0,
+      "completions/mean_length": 473.794677734375,
+      "completions/mean_terminated_length": 473.794677734375,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 1.1919009543461438,
+      "grad_norm": 0.7982268929481506,
+      "kl": 0.09130859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0023,
+      "num_tokens": 133861700.0,
+      "reward": 1.3062500953674316,
+      "reward_std": 0.23682567477226257,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3012829124927521,
+      "step": 1155
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1189.0,
+      "completions/max_terminated_length": 1189.0,
+      "completions/mean_length": 514.1160888671875,
+      "completions/mean_terminated_length": 514.1160888671875,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "epoch": 1.1929326799071447,
+      "grad_norm": 0.7393704056739807,
+      "kl": 0.0997314453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0099,
+      "num_tokens": 133983289.0,
+      "reward": 1.3062500953674316,
+      "reward_std": 0.2049076408147812,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3049239218235016,
+      "step": 1156
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 987.0,
+      "completions/max_terminated_length": 987.0,
+      "completions/mean_length": 544.544677734375,
+      "completions/mean_terminated_length": 544.544677734375,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 1.1939644054681455,
+      "grad_norm": 0.7494214773178101,
+      "kl": 0.0885009765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0005,
+      "num_tokens": 134107782.0,
+      "reward": 1.2125000953674316,
+      "reward_std": 0.1954411119222641,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21249999105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.2672431766986847,
+      "step": 1157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1025.0,
+      "completions/max_terminated_length": 1025.0,
+      "completions/mean_length": 550.5089721679688,
+      "completions/mean_terminated_length": 550.5089721679688,
+      "completions/min_length": 231.0,
+      "completions/min_terminated_length": 231.0,
+      "epoch": 1.1949961310291464,
+      "grad_norm": 0.6580347418785095,
+      "kl": 0.0845947265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0203,
+      "num_tokens": 134237185.0,
+      "reward": 1.2312501668930054,
+      "reward_std": 0.18060758709907532,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.23124997317790985,
+      "rewards/curriculum_aware_reward_fn/std": 0.2840445339679718,
+      "step": 1158
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1190.0,
+      "completions/max_terminated_length": 1190.0,
+      "completions/mean_length": 512.4464721679688,
+      "completions/mean_terminated_length": 512.4464721679688,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 1.196027856590147,
+      "grad_norm": 0.6041538715362549,
+      "kl": 0.091552734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0187,
+      "num_tokens": 134362367.0,
+      "reward": 1.3562500476837158,
+      "reward_std": 0.15172560513019562,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35624998807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.31157293915748596,
+      "step": 1159
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1382.0,
+      "completions/max_terminated_length": 1382.0,
+      "completions/mean_length": 489.9285888671875,
+      "completions/mean_terminated_length": 489.9285888671875,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 1.1970595821511478,
+      "grad_norm": 0.6599896550178528,
+      "kl": 0.107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0024,
+      "num_tokens": 134485258.0,
+      "reward": 1.3625000715255737,
+      "reward_std": 0.165399968624115,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.33196792006492615,
+      "step": 1160
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1643.0,
+      "completions/max_terminated_length": 1643.0,
+      "completions/mean_length": 527.1160888671875,
+      "completions/mean_terminated_length": 527.1160888671875,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 1.1980913077121484,
+      "grad_norm": 0.6196821928024292,
+      "kl": 0.08740234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0087,
+      "num_tokens": 134609682.0,
+      "reward": 1.3250001668930054,
+      "reward_std": 0.13873064517974854,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32500001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3279062509536743,
+      "step": 1161
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 907.0,
+      "completions/max_terminated_length": 907.0,
+      "completions/mean_length": 443.4375305175781,
+      "completions/mean_terminated_length": 443.4375305175781,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 1.1991230332731493,
+      "grad_norm": 0.7619950175285339,
+      "kl": 0.103515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0069,
+      "num_tokens": 134717580.0,
+      "reward": 1.328125,
+      "reward_std": 0.16038528084754944,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.328125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3230472207069397,
+      "step": 1162
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1017.0,
+      "completions/max_terminated_length": 1017.0,
+      "completions/mean_length": 505.83038330078125,
+      "completions/mean_terminated_length": 505.83038330078125,
+      "completions/min_length": 280.0,
+      "completions/min_terminated_length": 280.0,
+      "epoch": 1.2001547588341501,
+      "grad_norm": 0.6503728628158569,
+      "kl": 0.1016845703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0023,
+      "num_tokens": 134836596.0,
+      "reward": 1.3031251430511475,
+      "reward_std": 0.15563803911209106,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30312496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.3062620460987091,
+      "step": 1163
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 976.0,
+      "completions/max_terminated_length": 976.0,
+      "completions/mean_length": 492.02679443359375,
+      "completions/mean_terminated_length": 492.02679443359375,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 1.201186484395151,
+      "grad_norm": 0.5857566595077515,
+      "kl": 0.08642578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0115,
+      "num_tokens": 134955641.0,
+      "reward": 1.441071629524231,
+      "reward_std": 0.185373455286026,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44107145071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.44210872054100037,
+      "step": 1164
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1654.0,
+      "completions/max_terminated_length": 1654.0,
+      "completions/mean_length": 498.5714416503906,
+      "completions/mean_terminated_length": 498.5714416503906,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 1.2022182099561516,
+      "grad_norm": 0.6004607677459717,
+      "kl": 0.0880126953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0009,
+      "num_tokens": 135080144.0,
+      "reward": 1.3633930683135986,
+      "reward_std": 0.11078914254903793,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3633928596973419,
+      "rewards/curriculum_aware_reward_fn/std": 0.4917326867580414,
+      "step": 1165
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 990.0,
+      "completions/max_terminated_length": 990.0,
+      "completions/mean_length": 505.2589416503906,
+      "completions/mean_terminated_length": 505.2589416503906,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 1.2032499355171524,
+      "grad_norm": 0.5843690633773804,
+      "kl": 0.09912109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0093,
+      "num_tokens": 135206012.0,
+      "reward": 1.3232144117355347,
+      "reward_std": 0.14009526371955872,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32321426272392273,
+      "rewards/curriculum_aware_reward_fn/std": 0.3698629140853882,
+      "step": 1166
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 748.0,
+      "completions/max_terminated_length": 748.0,
+      "completions/mean_length": 463.76788330078125,
+      "completions/mean_terminated_length": 463.76788330078125,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "epoch": 1.2042816610781533,
+      "grad_norm": 0.7525951266288757,
+      "kl": 0.104248046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0168,
+      "num_tokens": 135324230.0,
+      "reward": 1.4459823369979858,
+      "reward_std": 0.17760148644447327,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4459821581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.37145552039146423,
+      "step": 1167
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1048.0,
+      "completions/max_terminated_length": 1048.0,
+      "completions/mean_length": 523.8928833007812,
+      "completions/mean_terminated_length": 523.8928833007812,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 1.2053133866391539,
+      "grad_norm": 0.630405604839325,
+      "kl": 0.078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0081,
+      "num_tokens": 135443148.0,
+      "reward": 1.3714287281036377,
+      "reward_std": 0.1765255331993103,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38035711646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.3761083781719208,
+      "step": 1168
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1106.0,
+      "completions/max_terminated_length": 1106.0,
+      "completions/mean_length": 400.2857360839844,
+      "completions/mean_terminated_length": 400.2857360839844,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 1.2063451122001547,
+      "grad_norm": 0.7302508354187012,
+      "kl": 0.0921630859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0284,
+      "num_tokens": 135543560.0,
+      "reward": 1.5000003576278687,
+      "reward_std": 0.13220220804214478,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5000000596046448,
+      "rewards/curriculum_aware_reward_fn/std": 0.3808477818965912,
+      "step": 1169
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 960.0,
+      "completions/max_terminated_length": 960.0,
+      "completions/mean_length": 493.9285888671875,
+      "completions/mean_terminated_length": 493.9285888671875,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "epoch": 1.2073768377611556,
+      "grad_norm": 0.734142005443573,
+      "kl": 0.0927734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0133,
+      "num_tokens": 135665324.0,
+      "reward": 1.369642972946167,
+      "reward_std": 0.21060726046562195,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36964288353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.34303608536720276,
+      "step": 1170
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 954.0,
+      "completions/max_terminated_length": 954.0,
+      "completions/mean_length": 515.5982666015625,
+      "completions/mean_terminated_length": 515.5982666015625,
+      "completions/min_length": 258.0,
+      "completions/min_terminated_length": 258.0,
+      "epoch": 1.2084085633221564,
+      "grad_norm": 0.7119922637939453,
+      "kl": 0.0936279296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0109,
+      "num_tokens": 135788927.0,
+      "reward": 1.3933037519454956,
+      "reward_std": 0.23654146492481232,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40223217010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.38181573152542114,
+      "step": 1171
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2014.0,
+      "completions/max_terminated_length": 2014.0,
+      "completions/mean_length": 584.2053833007812,
+      "completions/mean_terminated_length": 584.2053833007812,
+      "completions/min_length": 259.0,
+      "completions/min_terminated_length": 259.0,
+      "epoch": 1.209440288883157,
+      "grad_norm": 0.7124395966529846,
+      "kl": 0.0806884765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0204,
+      "num_tokens": 135935817.0,
+      "reward": 1.339285969734192,
+      "reward_std": 0.21949851512908936,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3392857015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.32427525520324707,
+      "step": 1172
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1225.0,
+      "completions/max_terminated_length": 1225.0,
+      "completions/mean_length": 549.9910888671875,
+      "completions/mean_terminated_length": 549.9910888671875,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "epoch": 1.2104720144441579,
+      "grad_norm": 0.4604604244232178,
+      "kl": 0.08837890625,
+      "learning_rate": 1e-06,
+      "loss": -0.002,
+      "num_tokens": 136064917.0,
+      "reward": 1.3205358982086182,
+      "reward_std": 0.0818769633769989,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32053571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.36394694447517395,
+      "step": 1173
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1015.0,
+      "completions/max_terminated_length": 1015.0,
+      "completions/mean_length": 499.6607360839844,
+      "completions/mean_terminated_length": 499.6607360839844,
+      "completions/min_length": 289.0,
+      "completions/min_terminated_length": 289.0,
+      "epoch": 1.2115037400051587,
+      "grad_norm": 0.6104110479354858,
+      "kl": 0.0931396484375,
+      "learning_rate": 1e-06,
+      "loss": 0.032,
+      "num_tokens": 136200364.0,
+      "reward": 1.3250001668930054,
+      "reward_std": 0.21099622547626495,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32500001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3657030463218689,
+      "step": 1174
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1051.0,
+      "completions/max_terminated_length": 1051.0,
+      "completions/mean_length": 505.9285888671875,
+      "completions/mean_terminated_length": 505.9285888671875,
+      "completions/min_length": 292.0,
+      "completions/min_terminated_length": 292.0,
+      "epoch": 1.2125354655661593,
+      "grad_norm": 0.6829614639282227,
+      "kl": 0.09326171875,
+      "learning_rate": 1e-06,
+      "loss": 0.011,
+      "num_tokens": 136324074.0,
+      "reward": 1.2803572416305542,
+      "reward_std": 0.1924617439508438,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28035715222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.33694157004356384,
+      "step": 1175
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1024.0,
+      "completions/mean_length": 505.2857360839844,
+      "completions/mean_terminated_length": 505.2857360839844,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 1.2135671911271602,
+      "grad_norm": 0.6645453572273254,
+      "kl": 0.0894775390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0119,
+      "num_tokens": 136454971.0,
+      "reward": 1.345982313156128,
+      "reward_std": 0.13870622217655182,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3459821343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.3621842563152313,
+      "step": 1176
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 899.0,
+      "completions/max_terminated_length": 899.0,
+      "completions/mean_length": 471.7857360839844,
+      "completions/mean_terminated_length": 471.7857360839844,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 1.214598916688161,
+      "grad_norm": 0.680292546749115,
+      "kl": 0.0860595703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0133,
+      "num_tokens": 136577650.0,
+      "reward": 1.4754464626312256,
+      "reward_std": 0.15931636095046997,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4754464328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3732614517211914,
+      "step": 1177
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1194.0,
+      "completions/max_terminated_length": 1194.0,
+      "completions/mean_length": 470.232177734375,
+      "completions/mean_terminated_length": 470.232177734375,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "epoch": 1.2156306422491618,
+      "grad_norm": 0.5292847752571106,
+      "kl": 0.087158203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0001,
+      "num_tokens": 136711404.0,
+      "reward": 1.441071629524231,
+      "reward_std": 0.13028402626514435,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44107145071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.3854454755783081,
+      "step": 1178
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1195.0,
+      "completions/max_terminated_length": 1195.0,
+      "completions/mean_length": 458.732177734375,
+      "completions/mean_terminated_length": 458.732177734375,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 1.2166623678101625,
+      "grad_norm": 0.645519495010376,
+      "kl": 0.09423828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0103,
+      "num_tokens": 136827679.0,
+      "reward": 1.5263392925262451,
+      "reward_std": 0.19883407652378082,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5352678894996643,
+      "rewards/curriculum_aware_reward_fn/std": 0.46443992853164673,
+      "step": 1179
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 895.0,
+      "completions/max_terminated_length": 895.0,
+      "completions/mean_length": 496.83038330078125,
+      "completions/mean_terminated_length": 496.83038330078125,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 1.2176940933711633,
+      "grad_norm": 0.6841337084770203,
+      "kl": 0.0869140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0065,
+      "num_tokens": 136954927.0,
+      "reward": 1.3924108743667603,
+      "reward_std": 0.194316565990448,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3924107253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.3710351884365082,
+      "step": 1180
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 776.0,
+      "completions/max_terminated_length": 776.0,
+      "completions/mean_length": 459.419677734375,
+      "completions/mean_terminated_length": 459.419677734375,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 1.2187258189321641,
+      "grad_norm": 0.576664388179779,
+      "kl": 0.091796875,
+      "learning_rate": 1e-06,
+      "loss": 0.01,
+      "num_tokens": 137073512.0,
+      "reward": 1.3700894117355347,
+      "reward_std": 0.1372222900390625,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3700892925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.35926979780197144,
+      "step": 1181
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1052.0,
+      "completions/max_terminated_length": 1052.0,
+      "completions/mean_length": 487.33038330078125,
+      "completions/mean_terminated_length": 487.33038330078125,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 1.2197575444931648,
+      "grad_norm": 0.6316289305686951,
+      "kl": 0.0909423828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0081,
+      "num_tokens": 137192283.0,
+      "reward": 1.385267972946167,
+      "reward_std": 0.10963393747806549,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38526788353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.3733002543449402,
+      "step": 1182
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 773.0,
+      "completions/max_terminated_length": 773.0,
+      "completions/mean_length": 414.46429443359375,
+      "completions/mean_terminated_length": 414.46429443359375,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 1.2207892700541656,
+      "grad_norm": 0.6138463616371155,
+      "kl": 0.08984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0018,
+      "num_tokens": 137300758.0,
+      "reward": 1.5013395547866821,
+      "reward_std": 0.10109592229127884,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.501339316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.37092897295951843,
+      "step": 1183
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1448.0,
+      "completions/max_terminated_length": 1448.0,
+      "completions/mean_length": 524.9732666015625,
+      "completions/mean_terminated_length": 524.9732666015625,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 1.2218209956151664,
+      "grad_norm": 0.6145269274711609,
+      "kl": 0.0877685546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0161,
+      "num_tokens": 137432362.0,
+      "reward": 1.2441965341567993,
+      "reward_std": 0.11928559094667435,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24419642984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.3394315540790558,
+      "step": 1184
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 952.0,
+      "completions/max_terminated_length": 952.0,
+      "completions/mean_length": 470.33929443359375,
+      "completions/mean_terminated_length": 470.33929443359375,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "epoch": 1.222852721176167,
+      "grad_norm": 0.6667640805244446,
+      "kl": 0.090576171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0137,
+      "num_tokens": 137550063.0,
+      "reward": 1.520535945892334,
+      "reward_std": 0.2047668844461441,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5205357074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.47998470067977905,
+      "step": 1185
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1110.0,
+      "completions/max_terminated_length": 1110.0,
+      "completions/mean_length": 497.294677734375,
+      "completions/mean_terminated_length": 497.294677734375,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 1.223884446737168,
+      "grad_norm": 0.7046597003936768,
+      "kl": 0.0909423828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0219,
+      "num_tokens": 137672029.0,
+      "reward": 1.4477680921554565,
+      "reward_std": 0.18011516332626343,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44776788353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.3280702829360962,
+      "step": 1186
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 888.0,
+      "completions/max_terminated_length": 888.0,
+      "completions/mean_length": 492.33929443359375,
+      "completions/mean_terminated_length": 492.33929443359375,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 1.2249161722981687,
+      "grad_norm": 0.5538958311080933,
+      "kl": 0.0887451171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0113,
+      "num_tokens": 137799446.0,
+      "reward": 1.5080358982086182,
+      "reward_std": 0.12931200861930847,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5080357193946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.42296433448791504,
+      "step": 1187
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1035.0,
+      "completions/max_terminated_length": 1035.0,
+      "completions/mean_length": 459.9107360839844,
+      "completions/mean_terminated_length": 459.9107360839844,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 1.2259478978591694,
+      "grad_norm": 0.753014326095581,
+      "kl": 0.091064453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0083,
+      "num_tokens": 137922853.0,
+      "reward": 1.4089287519454956,
+      "reward_std": 0.19899839162826538,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40892860293388367,
+      "rewards/curriculum_aware_reward_fn/std": 0.3784281611442566,
+      "step": 1188
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1130.0,
+      "completions/max_terminated_length": 1130.0,
+      "completions/mean_length": 473.27679443359375,
+      "completions/mean_terminated_length": 473.27679443359375,
+      "completions/min_length": 127.0,
+      "completions/min_terminated_length": 127.0,
+      "epoch": 1.2269796234201702,
+      "grad_norm": 0.6941073536872864,
+      "kl": 0.0966796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0333,
+      "num_tokens": 138040449.0,
+      "reward": 1.4625002145767212,
+      "reward_std": 0.16561460494995117,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46250003576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.38022634387016296,
+      "step": 1189
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1111.0,
+      "completions/max_terminated_length": 1111.0,
+      "completions/mean_length": 498.8125305175781,
+      "completions/mean_terminated_length": 498.8125305175781,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 1.228011348981171,
+      "grad_norm": 0.691802978515625,
+      "kl": 0.0828857421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0098,
+      "num_tokens": 138156019.0,
+      "reward": 1.3794643878936768,
+      "reward_std": 0.23291635513305664,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3794642984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.3979991376399994,
+      "step": 1190
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 828.0,
+      "completions/max_terminated_length": 828.0,
+      "completions/mean_length": 450.7232360839844,
+      "completions/mean_terminated_length": 450.7232360839844,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 1.2290430745421719,
+      "grad_norm": 0.5977465510368347,
+      "kl": 0.08984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0153,
+      "num_tokens": 138269297.0,
+      "reward": 1.4200893640518188,
+      "reward_std": 0.16302047669887543,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4200893044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.4009819030761719,
+      "step": 1191
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 867.0,
+      "completions/max_terminated_length": 867.0,
+      "completions/mean_length": 479.83929443359375,
+      "completions/mean_terminated_length": 479.83929443359375,
+      "completions/min_length": 231.0,
+      "completions/min_terminated_length": 231.0,
+      "epoch": 1.2300748001031725,
+      "grad_norm": 0.6863408088684082,
+      "kl": 0.094970703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0179,
+      "num_tokens": 138391751.0,
+      "reward": 1.440178632736206,
+      "reward_std": 0.16329091787338257,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44017860293388367,
+      "rewards/curriculum_aware_reward_fn/std": 0.3516908586025238,
+      "step": 1192
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 878.0,
+      "completions/max_terminated_length": 878.0,
+      "completions/mean_length": 510.39288330078125,
+      "completions/mean_terminated_length": 510.39288330078125,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 1.2311065256641733,
+      "grad_norm": 0.5389936566352844,
+      "kl": 0.086669921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0005,
+      "num_tokens": 138518957.0,
+      "reward": 1.3169643878936768,
+      "reward_std": 0.12550340592861176,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3169642984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.35141628980636597,
+      "step": 1193
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1196.0,
+      "completions/max_terminated_length": 1196.0,
+      "completions/mean_length": 497.8482360839844,
+      "completions/mean_terminated_length": 497.8482360839844,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 1.2321382512251742,
+      "grad_norm": 0.7427835464477539,
+      "kl": 0.09423828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0013,
+      "num_tokens": 138641720.0,
+      "reward": 1.3638393878936768,
+      "reward_std": 0.2322855442762375,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3638392984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.36392563581466675,
+      "step": 1194
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1054.0,
+      "completions/max_terminated_length": 1054.0,
+      "completions/mean_length": 473.77679443359375,
+      "completions/mean_terminated_length": 473.77679443359375,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 1.2331699767861748,
+      "grad_norm": 0.663589596748352,
+      "kl": 0.0946044921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0058,
+      "num_tokens": 138770775.0,
+      "reward": 1.354017972946167,
+      "reward_std": 0.16192704439163208,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35401788353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.38368406891822815,
+      "step": 1195
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1145.0,
+      "completions/max_terminated_length": 1145.0,
+      "completions/mean_length": 485.40179443359375,
+      "completions/mean_terminated_length": 485.40179443359375,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 1.2342017023471756,
+      "grad_norm": 0.6391304135322571,
+      "kl": 0.0919189453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0014,
+      "num_tokens": 138890863.0,
+      "reward": 1.35535728931427,
+      "reward_std": 0.13343793153762817,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3553571403026581,
+      "rewards/curriculum_aware_reward_fn/std": 0.35624146461486816,
+      "step": 1196
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1354.0,
+      "completions/max_terminated_length": 1354.0,
+      "completions/mean_length": 476.0714416503906,
+      "completions/mean_terminated_length": 476.0714416503906,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 1.2352334279081765,
+      "grad_norm": 0.8422456979751587,
+      "kl": 0.0897216796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0077,
+      "num_tokens": 139000687.0,
+      "reward": 1.3571429252624512,
+      "reward_std": 0.2340608388185501,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3660714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3382662832736969,
+      "step": 1197
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 816.0,
+      "completions/max_terminated_length": 816.0,
+      "completions/mean_length": 464.294677734375,
+      "completions/mean_terminated_length": 464.294677734375,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 1.2362651534691773,
+      "grad_norm": 0.7451651692390442,
+      "kl": 0.0931396484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0013,
+      "num_tokens": 139119384.0,
+      "reward": 1.3245537281036377,
+      "reward_std": 0.27345478534698486,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33348217606544495,
+      "rewards/curriculum_aware_reward_fn/std": 0.3578834533691406,
+      "step": 1198
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1026.0,
+      "completions/max_terminated_length": 1026.0,
+      "completions/mean_length": 491.4732360839844,
+      "completions/mean_terminated_length": 491.4732360839844,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 1.237296879030178,
+      "grad_norm": 0.7376614809036255,
+      "kl": 0.0958251953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0344,
+      "num_tokens": 139246018.0,
+      "reward": 1.3772321939468384,
+      "reward_std": 0.22673317790031433,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.377232164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.4012867212295532,
+      "step": 1199
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 761.0,
+      "completions/max_terminated_length": 761.0,
+      "completions/mean_length": 470.607177734375,
+      "completions/mean_terminated_length": 470.607177734375,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 1.2383286045911788,
+      "grad_norm": 0.6506025791168213,
+      "kl": 0.0880126953125,
+      "learning_rate": 1e-06,
+      "loss": 0.026,
+      "num_tokens": 139366850.0,
+      "reward": 1.3468750715255737,
+      "reward_std": 0.18790303170681,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35580357909202576,
+      "rewards/curriculum_aware_reward_fn/std": 0.3913332521915436,
+      "step": 1200
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 934.0,
+      "completions/max_terminated_length": 934.0,
+      "completions/mean_length": 450.732177734375,
+      "completions/mean_terminated_length": 450.732177734375,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 1.2393603301521796,
+      "grad_norm": 0.7297881245613098,
+      "kl": 0.093505859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0042,
+      "num_tokens": 139491259.0,
+      "reward": 1.4642857313156128,
+      "reward_std": 0.12518589198589325,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4642857015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.3733479082584381,
+      "step": 1201
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1011.0,
+      "completions/max_terminated_length": 1011.0,
+      "completions/mean_length": 479.58038330078125,
+      "completions/mean_terminated_length": 479.58038330078125,
+      "completions/min_length": 259.0,
+      "completions/min_terminated_length": 259.0,
+      "epoch": 1.2403920557131802,
+      "grad_norm": 0.7408363819122314,
+      "kl": 0.091064453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0043,
+      "num_tokens": 139618738.0,
+      "reward": 1.3156250715255737,
+      "reward_std": 0.1732882559299469,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3386533856391907,
+      "step": 1202
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1396.0,
+      "completions/max_terminated_length": 1396.0,
+      "completions/mean_length": 521.4732666015625,
+      "completions/mean_terminated_length": 521.4732666015625,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 1.241423781274181,
+      "grad_norm": 0.6051561236381531,
+      "kl": 0.0849609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0247,
+      "num_tokens": 139739646.0,
+      "reward": 1.4366072416305542,
+      "reward_std": 0.20512039959430695,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44553571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3702053129673004,
+      "step": 1203
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 873.0,
+      "completions/max_terminated_length": 873.0,
+      "completions/mean_length": 416.0357360839844,
+      "completions/mean_terminated_length": 416.0357360839844,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 1.242455506835182,
+      "grad_norm": 0.7176798582077026,
+      "kl": 0.0919189453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0058,
+      "num_tokens": 139847583.0,
+      "reward": 1.4732143878936768,
+      "reward_std": 0.200529545545578,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4821428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3688829839229584,
+      "step": 1204
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1159.0,
+      "completions/max_terminated_length": 1159.0,
+      "completions/mean_length": 477.2410888671875,
+      "completions/mean_terminated_length": 477.2410888671875,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 1.2434872323961825,
+      "grad_norm": 0.6013513207435608,
+      "kl": 0.09033203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0085,
+      "num_tokens": 139974079.0,
+      "reward": 1.352678656578064,
+      "reward_std": 0.14784403145313263,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3526785671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.37888169288635254,
+      "step": 1205
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1064.0,
+      "completions/max_terminated_length": 1064.0,
+      "completions/mean_length": 496.0982360839844,
+      "completions/mean_terminated_length": 496.0982360839844,
+      "completions/min_length": 143.0,
+      "completions/min_terminated_length": 143.0,
+      "epoch": 1.2445189579571834,
+      "grad_norm": 0.6596572399139404,
+      "kl": 0.0899658203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0175,
+      "num_tokens": 140102995.0,
+      "reward": 1.305803656578064,
+      "reward_std": 0.11023427546024323,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3058035671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.3495650887489319,
+      "step": 1206
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1341.0,
+      "completions/max_terminated_length": 1341.0,
+      "completions/mean_length": 471.607177734375,
+      "completions/mean_terminated_length": 471.607177734375,
+      "completions/min_length": 133.0,
+      "completions/min_terminated_length": 133.0,
+      "epoch": 1.2455506835181842,
+      "grad_norm": 0.7289984226226807,
+      "kl": 0.0968017578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0078,
+      "num_tokens": 140223778.0,
+      "reward": 1.3375000953674316,
+      "reward_std": 0.15725064277648926,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33750003576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.3675766885280609,
+      "step": 1207
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 857.0,
+      "completions/max_terminated_length": 857.0,
+      "completions/mean_length": 428.8750305175781,
+      "completions/mean_terminated_length": 428.8750305175781,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 1.2465824090791848,
+      "grad_norm": 0.7640532851219177,
+      "kl": 0.1015625,
+      "learning_rate": 1e-06,
+      "loss": 0.008,
+      "num_tokens": 140337339.0,
+      "reward": 1.41785728931427,
+      "reward_std": 0.19161218404769897,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41785717010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.3349638283252716,
+      "step": 1208
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1253.0,
+      "completions/max_terminated_length": 1253.0,
+      "completions/mean_length": 467.169677734375,
+      "completions/mean_terminated_length": 467.169677734375,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 1.2476141346401857,
+      "grad_norm": 0.7718006372451782,
+      "kl": 0.097412109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0086,
+      "num_tokens": 140459179.0,
+      "reward": 1.3183037042617798,
+      "reward_std": 0.17381516098976135,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32723215222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.3610475957393646,
+      "step": 1209
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 939.0,
+      "completions/max_terminated_length": 939.0,
+      "completions/mean_length": 514.2053833007812,
+      "completions/mean_terminated_length": 514.2053833007812,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 1.2486458602011865,
+      "grad_norm": 0.6831369996070862,
+      "kl": 0.0916748046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0049,
+      "num_tokens": 140586602.0,
+      "reward": 1.288839340209961,
+      "reward_std": 0.21529847383499146,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28883931040763855,
+      "rewards/curriculum_aware_reward_fn/std": 0.3663620352745056,
+      "step": 1210
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1083.0,
+      "completions/max_terminated_length": 1083.0,
+      "completions/mean_length": 455.8482360839844,
+      "completions/mean_terminated_length": 455.8482360839844,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 1.2496775857621873,
+      "grad_norm": 0.749707818031311,
+      "kl": 0.087890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0339,
+      "num_tokens": 140707727.0,
+      "reward": 1.3361608982086182,
+      "reward_std": 0.1401582509279251,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33616071939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3828067183494568,
+      "step": 1211
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 929.0,
+      "completions/max_terminated_length": 929.0,
+      "completions/mean_length": 473.7410888671875,
+      "completions/mean_terminated_length": 473.7410888671875,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 1.250709311323188,
+      "grad_norm": 0.679556667804718,
+      "kl": 0.0899658203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0213,
+      "num_tokens": 140828455.0,
+      "reward": 1.4352679252624512,
+      "reward_std": 0.20762227475643158,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4352678656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.37654417753219604,
+      "step": 1212
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 984.0,
+      "completions/max_terminated_length": 984.0,
+      "completions/mean_length": 533.0178833007812,
+      "completions/mean_terminated_length": 533.0178833007812,
+      "completions/min_length": 297.0,
+      "completions/min_terminated_length": 297.0,
+      "epoch": 1.2517410368841888,
+      "grad_norm": 0.7216713428497314,
+      "kl": 0.0875244140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0055,
+      "num_tokens": 140962094.0,
+      "reward": 1.3258929252624512,
+      "reward_std": 0.18453404307365417,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3258928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3267352879047394,
+      "step": 1213
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 833.0,
+      "completions/max_terminated_length": 833.0,
+      "completions/mean_length": 470.7232360839844,
+      "completions/mean_terminated_length": 470.7232360839844,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "epoch": 1.2527727624451896,
+      "grad_norm": 0.6506356000900269,
+      "kl": 0.094482421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0076,
+      "num_tokens": 141081994.0,
+      "reward": 1.2892858982086182,
+      "reward_std": 0.15539708733558655,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28928571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3660987913608551,
+      "step": 1214
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2342.0,
+      "completions/max_terminated_length": 2342.0,
+      "completions/mean_length": 522.357177734375,
+      "completions/mean_terminated_length": 522.357177734375,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 1.2538044880061903,
+      "grad_norm": 0.6972716450691223,
+      "kl": 0.0882568359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0029,
+      "num_tokens": 141211828.0,
+      "reward": 1.394196629524231,
+      "reward_std": 0.17642062902450562,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39419645071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.3766702115535736,
+      "step": 1215
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1132.0,
+      "completions/max_terminated_length": 1132.0,
+      "completions/mean_length": 558.7767944335938,
+      "completions/mean_terminated_length": 558.7767944335938,
+      "completions/min_length": 270.0,
+      "completions/min_terminated_length": 270.0,
+      "epoch": 1.254836213567191,
+      "grad_norm": 0.6484391689300537,
+      "kl": 0.0782470703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0069,
+      "num_tokens": 141347953.0,
+      "reward": 1.3245537281036377,
+      "reward_std": 0.15901994705200195,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33348211646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.32510486245155334,
+      "step": 1216
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1342.0,
+      "completions/max_terminated_length": 1342.0,
+      "completions/mean_length": 426.4732360839844,
+      "completions/mean_terminated_length": 426.4732360839844,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 1.255867939128192,
+      "grad_norm": 0.5722562074661255,
+      "kl": 0.1002197265625,
+      "learning_rate": 1e-06,
+      "loss": 0.002,
+      "num_tokens": 141459446.0,
+      "reward": 1.4455357789993286,
+      "reward_std": 0.15726637840270996,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45446428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.39952415227890015,
+      "step": 1217
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1195.0,
+      "completions/max_terminated_length": 1195.0,
+      "completions/mean_length": 458.4285888671875,
+      "completions/mean_terminated_length": 458.4285888671875,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 1.2568996646891928,
+      "grad_norm": 0.8675462007522583,
+      "kl": 0.0947265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0206,
+      "num_tokens": 141578242.0,
+      "reward": 1.4223216772079468,
+      "reward_std": 0.219200998544693,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42232146859169006,
+      "rewards/curriculum_aware_reward_fn/std": 0.3711947798728943,
+      "step": 1218
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 929.0,
+      "completions/max_terminated_length": 929.0,
+      "completions/mean_length": 496.3660888671875,
+      "completions/mean_terminated_length": 496.3660888671875,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 1.2579313902501934,
+      "grad_norm": 0.6168084144592285,
+      "kl": 0.088623046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0305,
+      "num_tokens": 141699796.0,
+      "reward": 1.4183037281036377,
+      "reward_std": 0.18364112079143524,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41830357909202576,
+      "rewards/curriculum_aware_reward_fn/std": 0.38816937804222107,
+      "step": 1219
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1291.0,
+      "completions/max_terminated_length": 1291.0,
+      "completions/mean_length": 486.02679443359375,
+      "completions/mean_terminated_length": 486.02679443359375,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 1.2589631158111942,
+      "grad_norm": 0.7693423628807068,
+      "kl": 0.0953369140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0296,
+      "num_tokens": 141823097.0,
+      "reward": 1.360267996788025,
+      "reward_std": 0.18344184756278992,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36026784777641296,
+      "rewards/curriculum_aware_reward_fn/std": 0.359375,
+      "step": 1220
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 828.0,
+      "completions/max_terminated_length": 828.0,
+      "completions/mean_length": 452.0625305175781,
+      "completions/mean_terminated_length": 452.0625305175781,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 1.2599948413721949,
+      "grad_norm": 0.7499447464942932,
+      "kl": 0.103271484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0139,
+      "num_tokens": 141945618.0,
+      "reward": 1.512946605682373,
+      "reward_std": 0.15902185440063477,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5218750238418579,
+      "rewards/curriculum_aware_reward_fn/std": 0.469463050365448,
+      "step": 1221
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 758.0,
+      "completions/max_terminated_length": 758.0,
+      "completions/mean_length": 440.4464416503906,
+      "completions/mean_terminated_length": 440.4464416503906,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 1.2610265669331957,
+      "grad_norm": 0.7502613663673401,
+      "kl": 0.1002197265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0213,
+      "num_tokens": 142059187.0,
+      "reward": 1.47633957862854,
+      "reward_std": 0.2236849069595337,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47633931040763855,
+      "rewards/curriculum_aware_reward_fn/std": 0.36242181062698364,
+      "step": 1222
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1683.0,
+      "completions/max_terminated_length": 1683.0,
+      "completions/mean_length": 499.5714416503906,
+      "completions/mean_terminated_length": 499.5714416503906,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 1.2620582924941965,
+      "grad_norm": 0.7217331528663635,
+      "kl": 0.0931396484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0052,
+      "num_tokens": 142181331.0,
+      "reward": 1.4142858982086182,
+      "reward_std": 0.20921562612056732,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41428571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.38193607330322266,
+      "step": 1223
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1464.0,
+      "completions/max_terminated_length": 1464.0,
+      "completions/mean_length": 482.5625305175781,
+      "completions/mean_terminated_length": 482.5625305175781,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "epoch": 1.2630900180551974,
+      "grad_norm": 0.7930240035057068,
+      "kl": 0.095703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0025,
+      "num_tokens": 142309739.0,
+      "reward": 1.2544643878936768,
+      "reward_std": 0.19663681089878082,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2633928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.323270320892334,
+      "step": 1224
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 833.0,
+      "completions/max_terminated_length": 833.0,
+      "completions/mean_length": 468.1964416503906,
+      "completions/mean_terminated_length": 468.1964416503906,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "epoch": 1.2641217436161982,
+      "grad_norm": 0.8367300033569336,
+      "kl": 0.103515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0218,
+      "num_tokens": 142435817.0,
+      "reward": 1.4767858982086182,
+      "reward_std": 0.2238132655620575,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47678571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3726535439491272,
+      "step": 1225
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1131.0,
+      "completions/max_terminated_length": 1131.0,
+      "completions/mean_length": 522.6875,
+      "completions/mean_terminated_length": 522.6875,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 1.2651534691771988,
+      "grad_norm": 0.7651290893554688,
+      "kl": 0.0888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0097,
+      "num_tokens": 142563679.0,
+      "reward": 1.3316963911056519,
+      "reward_std": 0.1838735193014145,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33169645071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.3260461688041687,
+      "step": 1226
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1438.0,
+      "completions/max_terminated_length": 1438.0,
+      "completions/mean_length": 538.2142944335938,
+      "completions/mean_terminated_length": 538.2142944335938,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 1.2661851947381997,
+      "grad_norm": 0.7349340319633484,
+      "kl": 0.091796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0045,
+      "num_tokens": 142701176.0,
+      "reward": 1.3214287757873535,
+      "reward_std": 0.23082859814167023,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3214285671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.35097768902778625,
+      "step": 1227
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 816.0,
+      "completions/max_terminated_length": 816.0,
+      "completions/mean_length": 471.2232360839844,
+      "completions/mean_terminated_length": 471.2232360839844,
+      "completions/min_length": 283.0,
+      "completions/min_terminated_length": 283.0,
+      "epoch": 1.2672169202992003,
+      "grad_norm": 0.7736876010894775,
+      "kl": 0.10205078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0119,
+      "num_tokens": 142821762.0,
+      "reward": 1.4781252145767212,
+      "reward_std": 0.15466152131557465,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4781250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3568998873233795,
+      "step": 1228
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 890.0,
+      "completions/max_terminated_length": 890.0,
+      "completions/mean_length": 466.9285888671875,
+      "completions/mean_terminated_length": 466.9285888671875,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "epoch": 1.2682486458602011,
+      "grad_norm": 0.6920589208602905,
+      "kl": 0.10595703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0032,
+      "num_tokens": 142940460.0,
+      "reward": 1.3821431398391724,
+      "reward_std": 0.1856403797864914,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3821428716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.3613573908805847,
+      "step": 1229
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 869.0,
+      "completions/max_terminated_length": 869.0,
+      "completions/mean_length": 472.9285888671875,
+      "completions/mean_terminated_length": 472.9285888671875,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "epoch": 1.269280371421202,
+      "grad_norm": 0.7609573602676392,
+      "kl": 0.1005859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0077,
+      "num_tokens": 143061762.0,
+      "reward": 1.394196629524231,
+      "reward_std": 0.17537376284599304,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39419645071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.34613344073295593,
+      "step": 1230
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 934.0,
+      "completions/max_terminated_length": 934.0,
+      "completions/mean_length": 481.7232360839844,
+      "completions/mean_terminated_length": 481.7232360839844,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 1.2703120969822028,
+      "grad_norm": 0.7466670870780945,
+      "kl": 0.0968017578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0211,
+      "num_tokens": 143182726.0,
+      "reward": 1.4718750715255737,
+      "reward_std": 0.1868813931941986,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47187498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.36010393500328064,
+      "step": 1231
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1468.0,
+      "completions/max_terminated_length": 1468.0,
+      "completions/mean_length": 538.8660888671875,
+      "completions/mean_terminated_length": 538.8660888671875,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 1.2713438225432034,
+      "grad_norm": 0.7122594714164734,
+      "kl": 0.097412109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0168,
+      "num_tokens": 143315263.0,
+      "reward": 1.2933037281036377,
+      "reward_std": 0.19857461750507355,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29330357909202576,
+      "rewards/curriculum_aware_reward_fn/std": 0.3709918260574341,
+      "step": 1232
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1390.0,
+      "completions/max_terminated_length": 1390.0,
+      "completions/mean_length": 533.9553833007812,
+      "completions/mean_terminated_length": 533.9553833007812,
+      "completions/min_length": 259.0,
+      "completions/min_terminated_length": 259.0,
+      "epoch": 1.2723755481042043,
+      "grad_norm": 0.6058053970336914,
+      "kl": 0.09716796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0121,
+      "num_tokens": 143442343.0,
+      "reward": 1.3241071701049805,
+      "reward_std": 0.12093013525009155,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32410717010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.3290706276893616,
+      "step": 1233
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1076.0,
+      "completions/max_terminated_length": 1076.0,
+      "completions/mean_length": 463.2857360839844,
+      "completions/mean_terminated_length": 463.2857360839844,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 1.2734072736652051,
+      "grad_norm": 0.7048044204711914,
+      "kl": 0.0992431640625,
+      "learning_rate": 1e-06,
+      "loss": 0.004,
+      "num_tokens": 143563230.0,
+      "reward": 1.4361608028411865,
+      "reward_std": 0.19449612498283386,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44508928060531616,
+      "rewards/curriculum_aware_reward_fn/std": 0.3961503803730011,
+      "step": 1234
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1277.0,
+      "completions/max_terminated_length": 1277.0,
+      "completions/mean_length": 503.8125305175781,
+      "completions/mean_terminated_length": 503.8125305175781,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 1.2744389992262057,
+      "grad_norm": 0.7165812253952026,
+      "kl": 0.0982666015625,
+      "learning_rate": 1e-06,
+      "loss": 0.018,
+      "num_tokens": 143683203.0,
+      "reward": 1.3352679014205933,
+      "reward_std": 0.17653267085552216,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3352678716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.35789918899536133,
+      "step": 1235
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1197.0,
+      "completions/max_terminated_length": 1197.0,
+      "completions/mean_length": 555.8660888671875,
+      "completions/mean_terminated_length": 555.8660888671875,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "epoch": 1.2754707247872066,
+      "grad_norm": 0.6849974393844604,
+      "kl": 0.09130859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0134,
+      "num_tokens": 143811036.0,
+      "reward": 1.2950893640518188,
+      "reward_std": 0.14985202252864838,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2950892746448517,
+      "rewards/curriculum_aware_reward_fn/std": 0.31615370512008667,
+      "step": 1236
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1254.0,
+      "completions/max_terminated_length": 1254.0,
+      "completions/mean_length": 580.9017944335938,
+      "completions/mean_terminated_length": 580.9017944335938,
+      "completions/min_length": 276.0,
+      "completions/min_terminated_length": 276.0,
+      "epoch": 1.2765024503482074,
+      "grad_norm": 0.6134920120239258,
+      "kl": 0.0875244140625,
+      "learning_rate": 1e-06,
+      "loss": 0.016,
+      "num_tokens": 143947000.0,
+      "reward": 1.3066965341567993,
+      "reward_std": 0.12845320999622345,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30669644474983215,
+      "rewards/curriculum_aware_reward_fn/std": 0.3435044288635254,
+      "step": 1237
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1131.0,
+      "completions/max_terminated_length": 1131.0,
+      "completions/mean_length": 469.8214416503906,
+      "completions/mean_terminated_length": 469.8214416503906,
+      "completions/min_length": 158.0,
+      "completions/min_terminated_length": 158.0,
+      "epoch": 1.2775341759092083,
+      "grad_norm": 0.7334262132644653,
+      "kl": 0.0987548828125,
+      "learning_rate": 1e-06,
+      "loss": 0.01,
+      "num_tokens": 144062697.0,
+      "reward": 1.4803574085235596,
+      "reward_std": 0.16172140836715698,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48035717010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.3639966547489166,
+      "step": 1238
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 920.0,
+      "completions/max_terminated_length": 920.0,
+      "completions/mean_length": 500.0000305175781,
+      "completions/mean_terminated_length": 500.0000305175781,
+      "completions/min_length": 258.0,
+      "completions/min_terminated_length": 258.0,
+      "epoch": 1.2785659014702089,
+      "grad_norm": 0.7100305557250977,
+      "kl": 0.1038818359375,
+      "learning_rate": 1e-06,
+      "loss": -0.005,
+      "num_tokens": 144200359.0,
+      "reward": 1.3312500715255737,
+      "reward_std": 0.18355976045131683,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33124998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3559738099575043,
+      "step": 1239
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1698.0,
+      "completions/max_terminated_length": 1698.0,
+      "completions/mean_length": 493.08038330078125,
+      "completions/mean_terminated_length": 493.08038330078125,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 1.2795976270312097,
+      "grad_norm": 0.7557586431503296,
+      "kl": 0.11279296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0167,
+      "num_tokens": 144329063.0,
+      "reward": 1.416517972946167,
+      "reward_std": 0.1822463870048523,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41651788353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.3514697849750519,
+      "step": 1240
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 827.0,
+      "completions/max_terminated_length": 827.0,
+      "completions/mean_length": 476.7232360839844,
+      "completions/mean_terminated_length": 476.7232360839844,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 1.2806293525922103,
+      "grad_norm": 0.7205198407173157,
+      "kl": 0.1007080078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0231,
+      "num_tokens": 144451168.0,
+      "reward": 1.2892858982086182,
+      "reward_std": 0.14720794558525085,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28928571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.34297508001327515,
+      "step": 1241
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 920.0,
+      "completions/max_terminated_length": 920.0,
+      "completions/mean_length": 450.5535888671875,
+      "completions/mean_terminated_length": 450.5535888671875,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 1.2816610781532112,
+      "grad_norm": 0.712121844291687,
+      "kl": 0.1102294921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0158,
+      "num_tokens": 144566814.0,
+      "reward": 1.3857144117355347,
+      "reward_std": 0.1151876151561737,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3857143223285675,
+      "rewards/curriculum_aware_reward_fn/std": 0.3716549575328827,
+      "step": 1242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1576.0,
+      "completions/max_terminated_length": 1576.0,
+      "completions/mean_length": 524.2053833007812,
+      "completions/mean_terminated_length": 524.2053833007812,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 1.282692803714212,
+      "grad_norm": 0.7179633975028992,
+      "kl": 0.100341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0041,
+      "num_tokens": 144695784.0,
+      "reward": 1.3245537281036377,
+      "reward_std": 0.18321338295936584,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32455357909202576,
+      "rewards/curriculum_aware_reward_fn/std": 0.350546270608902,
+      "step": 1243
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1109.0,
+      "completions/max_terminated_length": 1109.0,
+      "completions/mean_length": 489.5982360839844,
+      "completions/mean_terminated_length": 489.5982360839844,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 1.2837245292752129,
+      "grad_norm": 0.6813969016075134,
+      "kl": 0.1053466796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0007,
+      "num_tokens": 144831938.0,
+      "reward": 1.3406251668930054,
+      "reward_std": 0.15983448922634125,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34955358505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.3607734441757202,
+      "step": 1244
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 873.0,
+      "completions/max_terminated_length": 873.0,
+      "completions/mean_length": 472.6607360839844,
+      "completions/mean_terminated_length": 472.6607360839844,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 1.2847562548362137,
+      "grad_norm": 0.7200454473495483,
+      "kl": 0.0999755859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0254,
+      "num_tokens": 144951068.0,
+      "reward": 1.3401787281036377,
+      "reward_std": 0.1479780077934265,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34017854928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.3724668025970459,
+      "step": 1245
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1053.0,
+      "completions/max_terminated_length": 1053.0,
+      "completions/mean_length": 528.5178833007812,
+      "completions/mean_terminated_length": 528.5178833007812,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 1.2857879803972143,
+      "grad_norm": 0.74029541015625,
+      "kl": 0.105712890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0062,
+      "num_tokens": 145078841.0,
+      "reward": 1.2727679014205933,
+      "reward_std": 0.22440676391124725,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2727678716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.34880027174949646,
+      "step": 1246
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 998.0,
+      "completions/max_terminated_length": 998.0,
+      "completions/mean_length": 457.6875305175781,
+      "completions/mean_terminated_length": 457.6875305175781,
+      "completions/min_length": 278.0,
+      "completions/min_terminated_length": 278.0,
+      "epoch": 1.2868197059582152,
+      "grad_norm": 0.7149326205253601,
+      "kl": 0.1051025390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0064,
+      "num_tokens": 145197015.0,
+      "reward": 1.376339316368103,
+      "reward_std": 0.16753913462162018,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37633928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3888050615787506,
+      "step": 1247
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 695.0,
+      "completions/max_terminated_length": 695.0,
+      "completions/mean_length": 428.2946472167969,
+      "completions/mean_terminated_length": 428.2946472167969,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 1.2878514315192158,
+      "grad_norm": 0.6227552890777588,
+      "kl": 0.0963134765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0127,
+      "num_tokens": 145303770.0,
+      "reward": 1.481696605682373,
+      "reward_std": 0.13157878816127777,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4816964268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.3921525180339813,
+      "step": 1248
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 760.0,
+      "completions/max_terminated_length": 760.0,
+      "completions/mean_length": 469.3125305175781,
+      "completions/mean_terminated_length": 469.3125305175781,
+      "completions/min_length": 273.0,
+      "completions/min_terminated_length": 273.0,
+      "epoch": 1.2888831570802166,
+      "grad_norm": 0.7807196378707886,
+      "kl": 0.095458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0058,
+      "num_tokens": 145429217.0,
+      "reward": 1.4250000715255737,
+      "reward_std": 0.20809811353683472,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42499998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3775215744972229,
+      "step": 1249
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 887.0,
+      "completions/max_terminated_length": 887.0,
+      "completions/mean_length": 445.9732360839844,
+      "completions/mean_terminated_length": 445.9732360839844,
+      "completions/min_length": 137.0,
+      "completions/min_terminated_length": 137.0,
+      "epoch": 1.2899148826412175,
+      "grad_norm": 0.8612940907478333,
+      "kl": 0.115966796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0091,
+      "num_tokens": 145542145.0,
+      "reward": 1.3575894832611084,
+      "reward_std": 0.21384631097316742,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3665178716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.3716525435447693,
+      "step": 1250
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1211.0,
+      "completions/max_terminated_length": 1211.0,
+      "completions/mean_length": 463.5625305175781,
+      "completions/mean_terminated_length": 463.5625305175781,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 1.2909466082022183,
+      "grad_norm": 0.756700336933136,
+      "kl": 0.0977783203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0401,
+      "num_tokens": 145656441.0,
+      "reward": 1.4415180683135986,
+      "reward_std": 0.21089234948158264,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4504464566707611,
+      "rewards/curriculum_aware_reward_fn/std": 0.3938431441783905,
+      "step": 1251
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 865.0,
+      "completions/max_terminated_length": 865.0,
+      "completions/mean_length": 425.0089416503906,
+      "completions/mean_terminated_length": 425.0089416503906,
+      "completions/min_length": 155.0,
+      "completions/min_terminated_length": 155.0,
+      "epoch": 1.291978333763219,
+      "grad_norm": 0.7736772298812866,
+      "kl": 0.100830078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0169,
+      "num_tokens": 145768883.0,
+      "reward": 1.4053571224212646,
+      "reward_std": 0.10741391032934189,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41428571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3921199440956116,
+      "step": 1252
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1217.0,
+      "completions/max_terminated_length": 1217.0,
+      "completions/mean_length": 450.26788330078125,
+      "completions/mean_terminated_length": 450.26788330078125,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 1.2930100593242198,
+      "grad_norm": 0.744135320186615,
+      "kl": 0.103271484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0131,
+      "num_tokens": 145879464.0,
+      "reward": 1.3531252145767212,
+      "reward_std": 0.1737460494041443,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35312503576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.37688153982162476,
+      "step": 1253
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 827.0,
+      "completions/max_terminated_length": 827.0,
+      "completions/mean_length": 434.8125305175781,
+      "completions/mean_terminated_length": 434.8125305175781,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 1.2940417848852206,
+      "grad_norm": 0.680141270160675,
+      "kl": 0.1029052734375,
+      "learning_rate": 1e-06,
+      "loss": -0.003,
+      "num_tokens": 146001166.0,
+      "reward": 1.2763392925262451,
+      "reward_std": 0.14037790894508362,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2763392925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.36791107058525085,
+      "step": 1254
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 890.0,
+      "completions/max_terminated_length": 890.0,
+      "completions/mean_length": 414.51788330078125,
+      "completions/mean_terminated_length": 414.51788330078125,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 1.2950735104462212,
+      "grad_norm": 0.6044324636459351,
+      "kl": 0.1021728515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0013,
+      "num_tokens": 146109787.0,
+      "reward": 1.4133929014205933,
+      "reward_std": 0.11665144562721252,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4133928716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.3988712728023529,
+      "step": 1255
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 898.0,
+      "completions/max_terminated_length": 898.0,
+      "completions/mean_length": 460.6875305175781,
+      "completions/mean_terminated_length": 460.6875305175781,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 1.296105236007222,
+      "grad_norm": 0.7419890761375427,
+      "kl": 0.09716796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0097,
+      "num_tokens": 146228493.0,
+      "reward": 1.2852680683135986,
+      "reward_std": 0.1637239307165146,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2852678596973419,
+      "rewards/curriculum_aware_reward_fn/std": 0.3315020799636841,
+      "step": 1256
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1421.0,
+      "completions/max_terminated_length": 1421.0,
+      "completions/mean_length": 530.1964721679688,
+      "completions/mean_terminated_length": 530.1964721679688,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 1.297136961568223,
+      "grad_norm": 0.6371442675590515,
+      "kl": 0.084228515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0054,
+      "num_tokens": 146357812.0,
+      "reward": 1.391517996788025,
+      "reward_std": 0.163621187210083,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39151787757873535,
+      "rewards/curriculum_aware_reward_fn/std": 0.3548815846443176,
+      "step": 1257
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 721.0,
+      "completions/max_terminated_length": 721.0,
+      "completions/mean_length": 434.0357360839844,
+      "completions/mean_terminated_length": 434.0357360839844,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "epoch": 1.2981686871292237,
+      "grad_norm": 0.7808496952056885,
+      "kl": 0.102783203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0118,
+      "num_tokens": 146469809.0,
+      "reward": 1.4352679252624512,
+      "reward_std": 0.21879854798316956,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4352678954601288,
+      "rewards/curriculum_aware_reward_fn/std": 0.4187948405742645,
+      "step": 1258
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1025.0,
+      "completions/max_terminated_length": 1025.0,
+      "completions/mean_length": 463.232177734375,
+      "completions/mean_terminated_length": 463.232177734375,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 1.2992004126902243,
+      "grad_norm": 0.660305917263031,
+      "kl": 0.115478515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0152,
+      "num_tokens": 146599006.0,
+      "reward": 1.3049108982086182,
+      "reward_std": 0.13474413752555847,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31383928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3810375928878784,
+      "step": 1259
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1057.0,
+      "completions/max_terminated_length": 1057.0,
+      "completions/mean_length": 466.794677734375,
+      "completions/mean_terminated_length": 466.794677734375,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 1.3002321382512252,
+      "grad_norm": 0.7298482656478882,
+      "kl": 0.1090087890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0119,
+      "num_tokens": 146721819.0,
+      "reward": 1.383928656578064,
+      "reward_std": 0.18910807371139526,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3839285671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.47125548124313354,
+      "step": 1260
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 868.0,
+      "completions/max_terminated_length": 868.0,
+      "completions/mean_length": 467.2410888671875,
+      "completions/mean_terminated_length": 467.2410888671875,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 1.301263863812226,
+      "grad_norm": 0.7236865758895874,
+      "kl": 0.09375,
+      "learning_rate": 1e-06,
+      "loss": 0.0041,
+      "num_tokens": 146840370.0,
+      "reward": 1.3031251430511475,
+      "reward_std": 0.18334250152111053,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3031249940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.3641156852245331,
+      "step": 1261
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 688.0,
+      "completions/max_terminated_length": 688.0,
+      "completions/mean_length": 411.4375305175781,
+      "completions/mean_terminated_length": 411.4375305175781,
+      "completions/min_length": 118.0,
+      "completions/min_terminated_length": 118.0,
+      "epoch": 1.3022955893732266,
+      "grad_norm": 0.8327616453170776,
+      "kl": 0.1168212890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0095,
+      "num_tokens": 146957991.0,
+      "reward": 1.3495537042617798,
+      "reward_std": 0.17797954380512238,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34955358505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.3442249298095703,
+      "step": 1262
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 946.0,
+      "completions/max_terminated_length": 946.0,
+      "completions/mean_length": 479.1607360839844,
+      "completions/mean_terminated_length": 479.1607360839844,
+      "completions/min_length": 288.0,
+      "completions/min_terminated_length": 288.0,
+      "epoch": 1.3033273149342275,
+      "grad_norm": 0.6464178562164307,
+      "kl": 0.0970458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0005,
+      "num_tokens": 147089532.0,
+      "reward": 1.341071605682373,
+      "reward_std": 0.21358288824558258,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3410714268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.38020941615104675,
+      "step": 1263
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1121.0,
+      "completions/max_terminated_length": 1121.0,
+      "completions/mean_length": 440.26788330078125,
+      "completions/mean_terminated_length": 440.26788330078125,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 1.3043590404952283,
+      "grad_norm": 0.830440878868103,
+      "kl": 0.1002197265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0086,
+      "num_tokens": 147208030.0,
+      "reward": 1.49598228931427,
+      "reward_std": 0.17758455872535706,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49598217010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.3642923831939697,
+      "step": 1264
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 935.0,
+      "completions/max_terminated_length": 935.0,
+      "completions/mean_length": 464.58038330078125,
+      "completions/mean_terminated_length": 464.58038330078125,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 1.3053907660562292,
+      "grad_norm": 0.7556714415550232,
+      "kl": 0.1107177734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0145,
+      "num_tokens": 147333395.0,
+      "reward": 1.3482143878936768,
+      "reward_std": 0.18016089498996735,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3571428954601288,
+      "rewards/curriculum_aware_reward_fn/std": 0.3552601933479309,
+      "step": 1265
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 806.0,
+      "completions/max_terminated_length": 806.0,
+      "completions/mean_length": 444.0000305175781,
+      "completions/mean_terminated_length": 444.0000305175781,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 1.3064224916172298,
+      "grad_norm": 0.7321702837944031,
+      "kl": 0.107177734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0087,
+      "num_tokens": 147453074.0,
+      "reward": 1.2883929014205933,
+      "reward_std": 0.1664956659078598,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2883928716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.35526806116104126,
+      "step": 1266
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 977.0,
+      "completions/max_terminated_length": 977.0,
+      "completions/mean_length": 484.77679443359375,
+      "completions/mean_terminated_length": 484.77679443359375,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 1.3074542171782306,
+      "grad_norm": 0.7560392022132874,
+      "kl": 0.09521484375,
+      "learning_rate": 1e-06,
+      "loss": 0.026,
+      "num_tokens": 147577327.0,
+      "reward": 1.383928656578064,
+      "reward_std": 0.150946244597435,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3839285671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.36287233233451843,
+      "step": 1267
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 792.0,
+      "completions/max_terminated_length": 792.0,
+      "completions/mean_length": 443.232177734375,
+      "completions/mean_terminated_length": 443.232177734375,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 1.3084859427392312,
+      "grad_norm": 0.7266552448272705,
+      "kl": 0.1044921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0031,
+      "num_tokens": 147688625.0,
+      "reward": 1.3571429252624512,
+      "reward_std": 0.16170468926429749,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3571428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.35923197865486145,
+      "step": 1268
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 808.0,
+      "completions/max_terminated_length": 808.0,
+      "completions/mean_length": 477.5089416503906,
+      "completions/mean_terminated_length": 477.5089416503906,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 1.309517668300232,
+      "grad_norm": 0.9126338958740234,
+      "kl": 0.100830078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0169,
+      "num_tokens": 147816593.0,
+      "reward": 1.3531252145767212,
+      "reward_std": 0.22347509860992432,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35312503576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.35623666644096375,
+      "step": 1269
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 869.0,
+      "completions/max_terminated_length": 869.0,
+      "completions/mean_length": 497.39288330078125,
+      "completions/mean_terminated_length": 497.39288330078125,
+      "completions/min_length": 258.0,
+      "completions/min_terminated_length": 258.0,
+      "epoch": 1.310549393861233,
+      "grad_norm": 0.7279953956604004,
+      "kl": 0.1009521484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0142,
+      "num_tokens": 147941216.0,
+      "reward": 1.3294644355773926,
+      "reward_std": 0.1885773241519928,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33839288353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.33785107731819153,
+      "step": 1270
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1239.0,
+      "completions/max_terminated_length": 1239.0,
+      "completions/mean_length": 463.5714416503906,
+      "completions/mean_terminated_length": 463.5714416503906,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 1.3115811194222338,
+      "grad_norm": 0.6684086322784424,
+      "kl": 0.1041259765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0024,
+      "num_tokens": 148058194.0,
+      "reward": 1.4455358982086182,
+      "reward_std": 0.15577620267868042,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44553571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.395729124546051,
+      "step": 1271
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1330.0,
+      "completions/max_terminated_length": 1330.0,
+      "completions/mean_length": 494.96429443359375,
+      "completions/mean_terminated_length": 494.96429443359375,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 1.3126128449832344,
+      "grad_norm": 0.7396094799041748,
+      "kl": 0.09814453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0198,
+      "num_tokens": 148179325.0,
+      "reward": 1.4459823369979858,
+      "reward_std": 0.15884001553058624,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4459821879863739,
+      "rewards/curriculum_aware_reward_fn/std": 0.37567588686943054,
+      "step": 1272
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 810.0,
+      "completions/max_terminated_length": 810.0,
+      "completions/mean_length": 475.3750305175781,
+      "completions/mean_terminated_length": 475.3750305175781,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "epoch": 1.3136445705442352,
+      "grad_norm": 0.6397110223770142,
+      "kl": 0.1038818359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0005,
+      "num_tokens": 148296400.0,
+      "reward": 1.3174108266830444,
+      "reward_std": 0.1539250612258911,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31741073727607727,
+      "rewards/curriculum_aware_reward_fn/std": 0.3694840669631958,
+      "step": 1273
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1061.0,
+      "completions/max_terminated_length": 1061.0,
+      "completions/mean_length": 471.8839416503906,
+      "completions/mean_terminated_length": 471.8839416503906,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 1.314676296105236,
+      "grad_norm": 0.7454695105552673,
+      "kl": 0.1060791015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0187,
+      "num_tokens": 148415576.0,
+      "reward": 1.4093750715255737,
+      "reward_std": 0.15189561247825623,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40937501192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.37299203872680664,
+      "step": 1274
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1387.0,
+      "completions/max_terminated_length": 1387.0,
+      "completions/mean_length": 460.95538330078125,
+      "completions/mean_terminated_length": 460.95538330078125,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 1.3157080216662367,
+      "grad_norm": 0.821291446685791,
+      "kl": 0.112548828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0056,
+      "num_tokens": 148525149.0,
+      "reward": 1.3482143878936768,
+      "reward_std": 0.18040023744106293,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3571428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.35967057943344116,
+      "step": 1275
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 758.0,
+      "completions/max_terminated_length": 758.0,
+      "completions/mean_length": 428.7946472167969,
+      "completions/mean_terminated_length": 428.7946472167969,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 1.3167397472272375,
+      "grad_norm": 0.6388067007064819,
+      "kl": 0.1075439453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0167,
+      "num_tokens": 148635502.0,
+      "reward": 1.252678632736206,
+      "reward_std": 0.12519048154354095,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25267860293388367,
+      "rewards/curriculum_aware_reward_fn/std": 0.33408740162849426,
+      "step": 1276
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1192.0,
+      "completions/max_terminated_length": 1192.0,
+      "completions/mean_length": 510.9910888671875,
+      "completions/mean_terminated_length": 510.9910888671875,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 1.3177714727882384,
+      "grad_norm": 0.7216881513595581,
+      "kl": 0.093017578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0177,
+      "num_tokens": 148759681.0,
+      "reward": 1.368303656578064,
+      "reward_std": 0.17615720629692078,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3683035671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.3455636203289032,
+      "step": 1277
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 850.0,
+      "completions/max_terminated_length": 850.0,
+      "completions/mean_length": 467.982177734375,
+      "completions/mean_terminated_length": 467.982177734375,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 1.3188031983492392,
+      "grad_norm": 0.7484327554702759,
+      "kl": 0.1060791015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0025,
+      "num_tokens": 148878686.0,
+      "reward": 1.4361608028411865,
+      "reward_std": 0.16896887123584747,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43616074323654175,
+      "rewards/curriculum_aware_reward_fn/std": 0.36243733763694763,
+      "step": 1278
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1004.0,
+      "completions/max_terminated_length": 1004.0,
+      "completions/mean_length": 487.6339416503906,
+      "completions/mean_terminated_length": 487.6339416503906,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 1.3198349239102398,
+      "grad_norm": 0.8039454221725464,
+      "kl": 0.0941162109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0016,
+      "num_tokens": 149004316.0,
+      "reward": 1.2625000476837158,
+      "reward_std": 0.20210020244121552,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.27142855525016785,
+      "rewards/curriculum_aware_reward_fn/std": 0.34684643149375916,
+      "step": 1279
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1047.0,
+      "completions/max_terminated_length": 1047.0,
+      "completions/mean_length": 504.482177734375,
+      "completions/mean_terminated_length": 504.482177734375,
+      "completions/min_length": 151.0,
+      "completions/min_terminated_length": 151.0,
+      "epoch": 1.3208666494712407,
+      "grad_norm": 0.7807563543319702,
+      "kl": 0.0994873046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0062,
+      "num_tokens": 149136866.0,
+      "reward": 1.3790180683135986,
+      "reward_std": 0.14714425802230835,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3790178596973419,
+      "rewards/curriculum_aware_reward_fn/std": 0.36096954345703125,
+      "step": 1280
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 858.0,
+      "completions/max_terminated_length": 858.0,
+      "completions/mean_length": 489.857177734375,
+      "completions/mean_terminated_length": 489.857177734375,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "epoch": 1.3218983750322415,
+      "grad_norm": 0.7423040866851807,
+      "kl": 0.1202392578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0131,
+      "num_tokens": 149258818.0,
+      "reward": 1.2906250953674316,
+      "reward_std": 0.13313591480255127,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29062503576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.3379543125629425,
+      "step": 1281
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 874.0,
+      "completions/max_terminated_length": 874.0,
+      "completions/mean_length": 459.3482360839844,
+      "completions/mean_terminated_length": 459.3482360839844,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 1.3229301005932421,
+      "grad_norm": 0.6583361625671387,
+      "kl": 0.1065673828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0134,
+      "num_tokens": 149381917.0,
+      "reward": 1.5464287996292114,
+      "reward_std": 0.14643734693527222,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5553571581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.4766794741153717,
+      "step": 1282
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 916.0,
+      "completions/max_terminated_length": 916.0,
+      "completions/mean_length": 461.607177734375,
+      "completions/mean_terminated_length": 461.607177734375,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 1.323961826154243,
+      "grad_norm": 0.7041372656822205,
+      "kl": 0.1014404296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0032,
+      "num_tokens": 149501832.0,
+      "reward": 1.3616071939468384,
+      "reward_std": 0.14573663473129272,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3616071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.3674223721027374,
+      "step": 1283
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 797.0,
+      "completions/max_terminated_length": 797.0,
+      "completions/mean_length": 456.83038330078125,
+      "completions/mean_terminated_length": 456.83038330078125,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 1.3249935517152438,
+      "grad_norm": 0.8035464882850647,
+      "kl": 0.106689453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0209,
+      "num_tokens": 149624103.0,
+      "reward": 1.3138394355773926,
+      "reward_std": 0.1782553344964981,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31383928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.36712971329689026,
+      "step": 1284
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1754.0,
+      "completions/max_terminated_length": 1754.0,
+      "completions/mean_length": 497.27679443359375,
+      "completions/mean_terminated_length": 497.27679443359375,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 1.3260252772762446,
+      "grad_norm": 0.7910950779914856,
+      "kl": 0.104248046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0027,
+      "num_tokens": 149746137.0,
+      "reward": 1.4767857789993286,
+      "reward_std": 0.20006127655506134,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47678571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.38636261224746704,
+      "step": 1285
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 852.0,
+      "completions/max_terminated_length": 852.0,
+      "completions/mean_length": 486.9107360839844,
+      "completions/mean_terminated_length": 486.9107360839844,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "epoch": 1.3270570028372453,
+      "grad_norm": 0.7017124891281128,
+      "kl": 0.09912109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0061,
+      "num_tokens": 149873590.0,
+      "reward": 1.3361608982086182,
+      "reward_std": 0.19810323417186737,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33616071939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.37405797839164734,
+      "step": 1286
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 930.0,
+      "completions/max_terminated_length": 930.0,
+      "completions/mean_length": 479.70538330078125,
+      "completions/mean_terminated_length": 479.70538330078125,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "epoch": 1.328088728398246,
+      "grad_norm": 0.7664874792098999,
+      "kl": 0.115478515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0009,
+      "num_tokens": 149994230.0,
+      "reward": 1.247321605682373,
+      "reward_std": 0.1274891495704651,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24732144176959991,
+      "rewards/curriculum_aware_reward_fn/std": 0.3254822790622711,
+      "step": 1287
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 810.0,
+      "completions/max_terminated_length": 810.0,
+      "completions/mean_length": 475.7410888671875,
+      "completions/mean_terminated_length": 475.7410888671875,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 1.3291204539592467,
+      "grad_norm": 0.6920973062515259,
+      "kl": 0.1043701171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0059,
+      "num_tokens": 150116085.0,
+      "reward": 1.2580358982086182,
+      "reward_std": 0.18575353920459747,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25803571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3629554510116577,
+      "step": 1288
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1018.0,
+      "completions/max_terminated_length": 1018.0,
+      "completions/mean_length": 479.4732360839844,
+      "completions/mean_terminated_length": 479.4732360839844,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 1.3301521795202476,
+      "grad_norm": 0.7353091239929199,
+      "kl": 0.10546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0163,
+      "num_tokens": 150236993.0,
+      "reward": 1.411607265472412,
+      "reward_std": 0.15262123942375183,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41160717606544495,
+      "rewards/curriculum_aware_reward_fn/std": 0.34129998087882996,
+      "step": 1289
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 804.0,
+      "completions/max_terminated_length": 804.0,
+      "completions/mean_length": 435.1696472167969,
+      "completions/mean_terminated_length": 435.1696472167969,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 1.3311839050812484,
+      "grad_norm": 0.8722867965698242,
+      "kl": 0.120849609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0012,
+      "num_tokens": 150353093.0,
+      "reward": 1.4982144832611084,
+      "reward_std": 0.22456486523151398,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4982143044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.3938106894493103,
+      "step": 1290
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1175.0,
+      "completions/max_terminated_length": 1175.0,
+      "completions/mean_length": 529.232177734375,
+      "completions/mean_terminated_length": 529.232177734375,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 1.3322156306422492,
+      "grad_norm": 0.7795112729072571,
+      "kl": 0.104248046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0012,
+      "num_tokens": 150481030.0,
+      "reward": 1.3160717487335205,
+      "reward_std": 0.19670888781547546,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31607145071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.3604435920715332,
+      "step": 1291
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 961.0,
+      "completions/max_terminated_length": 961.0,
+      "completions/mean_length": 482.20538330078125,
+      "completions/mean_terminated_length": 482.20538330078125,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 1.3332473562032499,
+      "grad_norm": 0.9019424915313721,
+      "kl": 0.1171875,
+      "learning_rate": 1e-06,
+      "loss": -0.008,
+      "num_tokens": 150599815.0,
+      "reward": 1.4316965341567993,
+      "reward_std": 0.195232093334198,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43169641494750977,
+      "rewards/curriculum_aware_reward_fn/std": 0.37156811356544495,
+      "step": 1292
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 812.0,
+      "completions/max_terminated_length": 812.0,
+      "completions/mean_length": 448.6607360839844,
+      "completions/mean_terminated_length": 448.6607360839844,
+      "completions/min_length": 166.0,
+      "completions/min_terminated_length": 166.0,
+      "epoch": 1.3342790817642507,
+      "grad_norm": 0.7510018944740295,
+      "kl": 0.1064453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0119,
+      "num_tokens": 150720086.0,
+      "reward": 1.4589285850524902,
+      "reward_std": 0.16231584548950195,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4589286148548126,
+      "rewards/curriculum_aware_reward_fn/std": 0.3895720839500427,
+      "step": 1293
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1511.0,
+      "completions/max_terminated_length": 1511.0,
+      "completions/mean_length": 477.8035888671875,
+      "completions/mean_terminated_length": 477.8035888671875,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 1.3353108073252515,
+      "grad_norm": 0.8627240061759949,
+      "kl": 0.1124267578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0039,
+      "num_tokens": 150837174.0,
+      "reward": 1.305803656578064,
+      "reward_std": 0.21191909909248352,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3058035671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.34227287769317627,
+      "step": 1294
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 923.0,
+      "completions/max_terminated_length": 923.0,
+      "completions/mean_length": 456.26788330078125,
+      "completions/mean_terminated_length": 456.26788330078125,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 1.3363425328862522,
+      "grad_norm": 0.7269176840782166,
+      "kl": 0.0921630859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0009,
+      "num_tokens": 150953355.0,
+      "reward": 1.4727680683135986,
+      "reward_std": 0.15642473101615906,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4727678596973419,
+      "rewards/curriculum_aware_reward_fn/std": 0.39654409885406494,
+      "step": 1295
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3380.0,
+      "completions/max_terminated_length": 3380.0,
+      "completions/mean_length": 513.0982666015625,
+      "completions/mean_terminated_length": 513.0982666015625,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 1.337374258447253,
+      "grad_norm": 0.8845184445381165,
+      "kl": 0.1019287109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0001,
+      "num_tokens": 151073815.0,
+      "reward": 1.3383928537368774,
+      "reward_std": 0.23095576465129852,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33839288353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.3643445670604706,
+      "step": 1296
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 976.0,
+      "completions/max_terminated_length": 976.0,
+      "completions/mean_length": 451.6964416503906,
+      "completions/mean_terminated_length": 451.6964416503906,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 1.3384059840082538,
+      "grad_norm": 0.6694154739379883,
+      "kl": 0.103759765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0129,
+      "num_tokens": 151188469.0,
+      "reward": 1.4058037996292114,
+      "reward_std": 0.14767718315124512,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4058035910129547,
+      "rewards/curriculum_aware_reward_fn/std": 0.393399715423584,
+      "step": 1297
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1242.0,
+      "completions/max_terminated_length": 1242.0,
+      "completions/mean_length": 481.83929443359375,
+      "completions/mean_terminated_length": 481.83929443359375,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 1.3394377095692547,
+      "grad_norm": 0.6722558736801147,
+      "kl": 0.1217041015625,
+      "learning_rate": 1e-06,
+      "loss": 0.004,
+      "num_tokens": 151309159.0,
+      "reward": 1.244642972946167,
+      "reward_std": 0.12828920781612396,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25357145071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.35110601782798767,
+      "step": 1298
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1044.0,
+      "completions/max_terminated_length": 1044.0,
+      "completions/mean_length": 483.9732360839844,
+      "completions/mean_terminated_length": 483.9732360839844,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 1.3404694351302553,
+      "grad_norm": 0.6597891449928284,
+      "kl": 0.10498046875,
+      "learning_rate": 1e-06,
+      "loss": 0.002,
+      "num_tokens": 151437193.0,
+      "reward": 1.2696430683135986,
+      "reward_std": 0.10995526611804962,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2696428596973419,
+      "rewards/curriculum_aware_reward_fn/std": 0.3576926290988922,
+      "step": 1299
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 817.0,
+      "completions/max_terminated_length": 817.0,
+      "completions/mean_length": 489.1785888671875,
+      "completions/mean_terminated_length": 489.1785888671875,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 1.3415011606912561,
+      "grad_norm": 0.797222375869751,
+      "kl": 0.1046142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0018,
+      "num_tokens": 151562002.0,
+      "reward": 1.3330358266830444,
+      "reward_std": 0.2373420149087906,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3330357074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.3669053614139557,
+      "step": 1300
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 978.0,
+      "completions/max_terminated_length": 978.0,
+      "completions/mean_length": 487.14288330078125,
+      "completions/mean_terminated_length": 487.14288330078125,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 1.342532886252257,
+      "grad_norm": 0.6237560510635376,
+      "kl": 0.1055908203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0111,
+      "num_tokens": 151688235.0,
+      "reward": 1.2852680683135986,
+      "reward_std": 0.11172077059745789,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2852678596973419,
+      "rewards/curriculum_aware_reward_fn/std": 0.34910687804222107,
+      "step": 1301
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1502.0,
+      "completions/max_terminated_length": 1502.0,
+      "completions/mean_length": 582.1517944335938,
+      "completions/mean_terminated_length": 582.1517944335938,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 1.3435646118132576,
+      "grad_norm": 0.7618681192398071,
+      "kl": 0.1015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0461,
+      "num_tokens": 151826656.0,
+      "reward": 1.3089287281036377,
+      "reward_std": 0.21061131358146667,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30892854928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.3483690619468689,
+      "step": 1302
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1174.0,
+      "completions/max_terminated_length": 1174.0,
+      "completions/mean_length": 497.8839416503906,
+      "completions/mean_terminated_length": 497.8839416503906,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "epoch": 1.3445963373742584,
+      "grad_norm": 0.6274468898773193,
+      "kl": 0.1064453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0267,
+      "num_tokens": 151958187.0,
+      "reward": 1.3544644117355347,
+      "reward_std": 0.139055073261261,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3544642925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.458530455827713,
+      "step": 1303
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1093.0,
+      "completions/max_terminated_length": 1093.0,
+      "completions/mean_length": 448.7500305175781,
+      "completions/mean_terminated_length": 448.7500305175781,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 1.3456280629352593,
+      "grad_norm": 0.6114272475242615,
+      "kl": 0.09912109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0002,
+      "num_tokens": 152069830.0,
+      "reward": 1.3660715818405151,
+      "reward_std": 0.10887427628040314,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3660714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3791416585445404,
+      "step": 1304
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1100.0,
+      "completions/max_terminated_length": 1100.0,
+      "completions/mean_length": 453.52679443359375,
+      "completions/mean_terminated_length": 453.52679443359375,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 1.3466597884962601,
+      "grad_norm": 0.6221665143966675,
+      "kl": 0.108154296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0138,
+      "num_tokens": 152193285.0,
+      "reward": 1.4495537281036377,
+      "reward_std": 0.13579247891902924,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44955357909202576,
+      "rewards/curriculum_aware_reward_fn/std": 0.3779684603214264,
+      "step": 1305
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2482.0,
+      "completions/max_terminated_length": 2482.0,
+      "completions/mean_length": 480.6875305175781,
+      "completions/mean_terminated_length": 480.6875305175781,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 1.3476915140572607,
+      "grad_norm": 0.5587440729141235,
+      "kl": 0.098876953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0149,
+      "num_tokens": 152313748.0,
+      "reward": 1.4629465341567993,
+      "reward_std": 0.15617386996746063,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46294644474983215,
+      "rewards/curriculum_aware_reward_fn/std": 0.3960244953632355,
+      "step": 1306
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1261.0,
+      "completions/mean_length": 517.6964721679688,
+      "completions/mean_terminated_length": 485.45947265625,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 1.3487232396182616,
+      "grad_norm": 0.8069753050804138,
+      "kl": 0.1019287109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0397,
+      "num_tokens": 152449374.0,
+      "reward": 1.364285945892334,
+      "reward_std": 0.24357934296131134,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3821428716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.3882773816585541,
+      "step": 1307
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 783.0,
+      "completions/max_terminated_length": 783.0,
+      "completions/mean_length": 461.08929443359375,
+      "completions/mean_terminated_length": 461.08929443359375,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "epoch": 1.3497549651792622,
+      "grad_norm": 0.8312681317329407,
+      "kl": 0.0958251953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0111,
+      "num_tokens": 152566480.0,
+      "reward": 1.3772321939468384,
+      "reward_std": 0.23976361751556396,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3772321343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.3572220504283905,
+      "step": 1308
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1349.0,
+      "completions/max_terminated_length": 1349.0,
+      "completions/mean_length": 493.3214416503906,
+      "completions/mean_terminated_length": 493.3214416503906,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 1.350786690740263,
+      "grad_norm": 0.600620448589325,
+      "kl": 0.102294921875,
+      "learning_rate": 1e-06,
+      "loss": -0.002,
+      "num_tokens": 152701007.0,
+      "reward": 1.3272322416305542,
+      "reward_std": 0.1470688432455063,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33616071939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.35235437750816345,
+      "step": 1309
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1064.0,
+      "completions/max_terminated_length": 1064.0,
+      "completions/mean_length": 469.58038330078125,
+      "completions/mean_terminated_length": 469.58038330078125,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 1.3518184163012639,
+      "grad_norm": 0.742341935634613,
+      "kl": 0.1064453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0068,
+      "num_tokens": 152823208.0,
+      "reward": 1.5727680921554565,
+      "reward_std": 0.16378778219223022,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5727678537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.34300529956817627,
+      "step": 1310
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1022.0,
+      "completions/max_terminated_length": 1022.0,
+      "completions/mean_length": 480.3125305175781,
+      "completions/mean_terminated_length": 480.3125305175781,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 1.3528501418622647,
+      "grad_norm": 0.7505616545677185,
+      "kl": 0.106201171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0043,
+      "num_tokens": 152942537.0,
+      "reward": 1.2919644117355347,
+      "reward_std": 0.14375154674053192,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2919642925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.3463161289691925,
+      "step": 1311
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1018.0,
+      "completions/max_terminated_length": 1018.0,
+      "completions/mean_length": 489.8482360839844,
+      "completions/mean_terminated_length": 489.8482360839844,
+      "completions/min_length": 150.0,
+      "completions/min_terminated_length": 150.0,
+      "epoch": 1.3538818674232653,
+      "grad_norm": 0.7036014795303345,
+      "kl": 0.0926513671875,
+      "learning_rate": 1e-06,
+      "loss": 0.019,
+      "num_tokens": 153058076.0,
+      "reward": 1.3107143640518188,
+      "reward_std": 0.13862745463848114,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3107142746448517,
+      "rewards/curriculum_aware_reward_fn/std": 0.37677082419395447,
+      "step": 1312
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1097.0,
+      "completions/max_terminated_length": 1097.0,
+      "completions/mean_length": 491.419677734375,
+      "completions/mean_terminated_length": 491.419677734375,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 1.3549135929842662,
+      "grad_norm": 0.6712673902511597,
+      "kl": 0.0941162109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0055,
+      "num_tokens": 153177459.0,
+      "reward": 1.2339286804199219,
+      "reward_std": 0.15220916271209717,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24285714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.34379029273986816,
+      "step": 1313
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1090.0,
+      "completions/max_terminated_length": 1090.0,
+      "completions/mean_length": 460.3839416503906,
+      "completions/mean_terminated_length": 460.3839416503906,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 1.355945318545267,
+      "grad_norm": 0.7762075066566467,
+      "kl": 0.1126708984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0224,
+      "num_tokens": 153296243.0,
+      "reward": 1.4513394832611084,
+      "reward_std": 0.17631617188453674,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4513393044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.38034242391586304,
+      "step": 1314
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1333.0,
+      "completions/max_terminated_length": 1333.0,
+      "completions/mean_length": 469.1250305175781,
+      "completions/mean_terminated_length": 469.1250305175781,
+      "completions/min_length": 143.0,
+      "completions/min_terminated_length": 143.0,
+      "epoch": 1.3569770441062676,
+      "grad_norm": 0.690093457698822,
+      "kl": 0.103515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0279,
+      "num_tokens": 153412681.0,
+      "reward": 1.372321605682373,
+      "reward_std": 0.23327423632144928,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3812500536441803,
+      "rewards/curriculum_aware_reward_fn/std": 0.40935707092285156,
+      "step": 1315
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1092.0,
+      "completions/max_terminated_length": 1092.0,
+      "completions/mean_length": 470.5714416503906,
+      "completions/mean_terminated_length": 470.5714416503906,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 1.3580087696672685,
+      "grad_norm": 0.7823927998542786,
+      "kl": 0.101806640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0137,
+      "num_tokens": 153528133.0,
+      "reward": 1.4848215579986572,
+      "reward_std": 0.2249300479888916,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49375003576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.44782379269599915,
+      "step": 1316
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1485.0,
+      "completions/max_terminated_length": 1485.0,
+      "completions/mean_length": 515.232177734375,
+      "completions/mean_terminated_length": 515.232177734375,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "epoch": 1.3590404952282693,
+      "grad_norm": 0.5873052477836609,
+      "kl": 0.0999755859375,
+      "learning_rate": 1e-06,
+      "loss": -0.014,
+      "num_tokens": 153646361.0,
+      "reward": 1.317857265472412,
+      "reward_std": 0.11958328634500504,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31785717606544495,
+      "rewards/curriculum_aware_reward_fn/std": 0.43799594044685364,
+      "step": 1317
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1250.0,
+      "completions/max_terminated_length": 1250.0,
+      "completions/mean_length": 527.4375,
+      "completions/mean_terminated_length": 527.4375,
+      "completions/min_length": 258.0,
+      "completions/min_terminated_length": 258.0,
+      "epoch": 1.3600722207892701,
+      "grad_norm": 2.275315046310425,
+      "kl": 0.24658203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0292,
+      "num_tokens": 153782609.0,
+      "reward": 1.2450894117355347,
+      "reward_std": 0.20867618918418884,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2540178894996643,
+      "rewards/curriculum_aware_reward_fn/std": 0.322304368019104,
+      "step": 1318
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1005.0,
+      "completions/max_terminated_length": 1005.0,
+      "completions/mean_length": 478.482177734375,
+      "completions/mean_terminated_length": 478.482177734375,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 1.3611039463502708,
+      "grad_norm": 0.6259239315986633,
+      "kl": 0.09228515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0235,
+      "num_tokens": 153895538.0,
+      "reward": 1.4031251668930054,
+      "reward_std": 0.17404134571552277,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41205358505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.3928554654121399,
+      "step": 1319
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1701.0,
+      "completions/max_terminated_length": 1701.0,
+      "completions/mean_length": 481.4107360839844,
+      "completions/mean_terminated_length": 481.4107360839844,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 1.3621356719112716,
+      "grad_norm": 0.7713581919670105,
+      "kl": 0.10888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0174,
+      "num_tokens": 154021374.0,
+      "reward": 1.4316965341567993,
+      "reward_std": 0.17943032085895538,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43169644474983215,
+      "rewards/curriculum_aware_reward_fn/std": 0.3492588996887207,
+      "step": 1320
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1068.0,
+      "completions/max_terminated_length": 1068.0,
+      "completions/mean_length": 429.2589416503906,
+      "completions/mean_terminated_length": 429.2589416503906,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 1.3631673974722724,
+      "grad_norm": 0.777759850025177,
+      "kl": 0.1014404296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0039,
+      "num_tokens": 154128706.0,
+      "reward": 1.512946605682373,
+      "reward_std": 0.2099122405052185,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5218750238418579,
+      "rewards/curriculum_aware_reward_fn/std": 0.38699474930763245,
+      "step": 1321
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3884.0,
+      "completions/max_terminated_length": 3884.0,
+      "completions/mean_length": 487.7410888671875,
+      "completions/mean_terminated_length": 487.7410888671875,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 1.364199123033273,
+      "grad_norm": 0.6767941117286682,
+      "kl": 0.09033203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0382,
+      "num_tokens": 154249462.0,
+      "reward": 1.4066966772079468,
+      "reward_std": 0.177678644657135,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43348217010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.3961747884750366,
+      "step": 1322
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 877.0,
+      "completions/max_terminated_length": 877.0,
+      "completions/mean_length": 474.1160888671875,
+      "completions/mean_terminated_length": 474.1160888671875,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 1.365230848594274,
+      "grad_norm": 0.7261276841163635,
+      "kl": 0.1041259765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0182,
+      "num_tokens": 154372535.0,
+      "reward": 1.375892996788025,
+      "reward_std": 0.20264343917369843,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40267854928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.35220280289649963,
+      "step": 1323
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 926.0,
+      "completions/max_terminated_length": 926.0,
+      "completions/mean_length": 462.6250305175781,
+      "completions/mean_terminated_length": 462.6250305175781,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 1.3662625741552747,
+      "grad_norm": 0.7062304019927979,
+      "kl": 0.1004638671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0282,
+      "num_tokens": 154494279.0,
+      "reward": 1.3062500953674316,
+      "reward_std": 0.1957845240831375,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33303573727607727,
+      "rewards/curriculum_aware_reward_fn/std": 0.37160196900367737,
+      "step": 1324
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1075.0,
+      "completions/max_terminated_length": 1075.0,
+      "completions/mean_length": 476.1964416503906,
+      "completions/mean_terminated_length": 476.1964416503906,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 1.3672942997162756,
+      "grad_norm": 0.7053699493408203,
+      "kl": 0.1025390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0126,
+      "num_tokens": 154612811.0,
+      "reward": 1.2950894832611084,
+      "reward_std": 0.1937311738729477,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32187503576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.3728863000869751,
+      "step": 1325
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 938.0,
+      "completions/max_terminated_length": 938.0,
+      "completions/mean_length": 430.9464416503906,
+      "completions/mean_terminated_length": 430.9464416503906,
+      "completions/min_length": 121.0,
+      "completions/min_terminated_length": 121.0,
+      "epoch": 1.3683260252772762,
+      "grad_norm": 0.7252833843231201,
+      "kl": 0.1053466796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0016,
+      "num_tokens": 154728347.0,
+      "reward": 1.4075894355773926,
+      "reward_std": 0.22101444005966187,
+      "rewards/code_format_reward/mean": 0.9553571343421936,
+      "rewards/code_format_reward/std": 0.2074466347694397,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45223215222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.37820035219192505,
+      "step": 1326
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1116.0,
+      "completions/max_terminated_length": 1116.0,
+      "completions/mean_length": 480.58929443359375,
+      "completions/mean_terminated_length": 480.58929443359375,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 1.369357750838277,
+      "grad_norm": 0.7784614562988281,
+      "kl": 0.095703125,
+      "learning_rate": 1e-06,
+      "loss": 0.051,
+      "num_tokens": 154848064.0,
+      "reward": 1.3196429014205933,
+      "reward_std": 0.24868425726890564,
+      "rewards/code_format_reward/mean": 0.9642857313156128,
+      "rewards/code_format_reward/std": 0.18641091883182526,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3553571403026581,
+      "rewards/curriculum_aware_reward_fn/std": 0.3615131974220276,
+      "step": 1327
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1087.0,
+      "completions/max_terminated_length": 1087.0,
+      "completions/mean_length": 450.1875305175781,
+      "completions/mean_terminated_length": 450.1875305175781,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 1.3703894763992777,
+      "grad_norm": 0.6946715116500854,
+      "kl": 0.09619140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0064,
+      "num_tokens": 154974022.0,
+      "reward": 1.3125,
+      "reward_std": 0.17005577683448792,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3303571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.3713475167751312,
+      "step": 1328
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 794.0,
+      "completions/max_terminated_length": 794.0,
+      "completions/mean_length": 422.77679443359375,
+      "completions/mean_terminated_length": 422.77679443359375,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 1.3714212019602785,
+      "grad_norm": 0.8444381356239319,
+      "kl": 0.0989990234375,
+      "learning_rate": 1e-06,
+      "loss": -0.003,
+      "num_tokens": 155080432.0,
+      "reward": 1.4718753099441528,
+      "reward_std": 0.23361340165138245,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47187498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3599788248538971,
+      "step": 1329
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1056.0,
+      "completions/max_terminated_length": 1056.0,
+      "completions/mean_length": 485.7589416503906,
+      "completions/mean_terminated_length": 485.7589416503906,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 1.3724529275212793,
+      "grad_norm": 0.7258499264717102,
+      "kl": 0.100830078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0084,
+      "num_tokens": 155199545.0,
+      "reward": 1.2245535850524902,
+      "reward_std": 0.2251826673746109,
+      "rewards/code_format_reward/mean": 0.9464285969734192,
+      "rewards/code_format_reward/std": 0.2261819988489151,
+      "rewards/curriculum_aware_reward_fn/mean": 0.27812498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.3506609797477722,
+      "step": 1330
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 768.0,
+      "completions/max_terminated_length": 768.0,
+      "completions/mean_length": 422.02679443359375,
+      "completions/mean_terminated_length": 422.02679443359375,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 1.3734846530822802,
+      "grad_norm": 0.8235486745834351,
+      "kl": 0.093994140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0185,
+      "num_tokens": 155307754.0,
+      "reward": 1.4330357313156128,
+      "reward_std": 0.22114646434783936,
+      "rewards/code_format_reward/mean": 0.9642857313156128,
+      "rewards/code_format_reward/std": 0.18641091883182526,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3736914098262787,
+      "step": 1331
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1188.0,
+      "completions/max_terminated_length": 1188.0,
+      "completions/mean_length": 413.3482360839844,
+      "completions/mean_terminated_length": 413.3482360839844,
+      "completions/min_length": 124.0,
+      "completions/min_terminated_length": 124.0,
+      "epoch": 1.374516378643281,
+      "grad_norm": 0.7121307253837585,
+      "kl": 0.0933837890625,
+      "learning_rate": 1e-06,
+      "loss": 0.007,
+      "num_tokens": 155420871.0,
+      "reward": 1.364732265472412,
+      "reward_std": 0.15526407957077026,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38258931040763855,
+      "rewards/curriculum_aware_reward_fn/std": 0.3583439290523529,
+      "step": 1332
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1161.0,
+      "completions/max_terminated_length": 1161.0,
+      "completions/mean_length": 467.0982360839844,
+      "completions/mean_terminated_length": 467.0982360839844,
+      "completions/min_length": 166.0,
+      "completions/min_terminated_length": 166.0,
+      "epoch": 1.3755481042042816,
+      "grad_norm": 0.756607711315155,
+      "kl": 0.0885009765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0375,
+      "num_tokens": 155544243.0,
+      "reward": 1.3258929252624512,
+      "reward_std": 0.21575960516929626,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3526785671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.37595757842063904,
+      "step": 1333
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1019.0,
+      "completions/max_terminated_length": 1019.0,
+      "completions/mean_length": 451.20538330078125,
+      "completions/mean_terminated_length": 451.20538330078125,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 1.3765798297652825,
+      "grad_norm": 0.8574563264846802,
+      "kl": 0.0914306640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0005,
+      "num_tokens": 155663308.0,
+      "reward": 1.3325893878936768,
+      "reward_std": 0.1875346153974533,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3415178656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.36291417479515076,
+      "step": 1334
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 952.0,
+      "completions/max_terminated_length": 952.0,
+      "completions/mean_length": 474.2857360839844,
+      "completions/mean_terminated_length": 474.2857360839844,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 1.377611555326283,
+      "grad_norm": 0.6797184348106384,
+      "kl": 0.0853271484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0023,
+      "num_tokens": 155783103.0,
+      "reward": 1.41785728931427,
+      "reward_std": 0.1773865520954132,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42678573727607727,
+      "rewards/curriculum_aware_reward_fn/std": 0.37034979462623596,
+      "step": 1335
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 906.0,
+      "completions/max_terminated_length": 906.0,
+      "completions/mean_length": 488.8660888671875,
+      "completions/mean_terminated_length": 488.8660888671875,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 1.378643280887284,
+      "grad_norm": 0.7231094241142273,
+      "kl": 0.08642578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0072,
+      "num_tokens": 155905203.0,
+      "reward": 1.2906250953674316,
+      "reward_std": 0.19704152643680573,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2995535731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.37062740325927734,
+      "step": 1336
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1065.0,
+      "completions/max_terminated_length": 1065.0,
+      "completions/mean_length": 455.8482360839844,
+      "completions/mean_terminated_length": 455.8482360839844,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 1.3796750064482848,
+      "grad_norm": 0.7511847019195557,
+      "kl": 0.087646484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0121,
+      "num_tokens": 156016675.0,
+      "reward": 1.3816964626312256,
+      "reward_std": 0.12826259434223175,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3816964328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3298235833644867,
+      "step": 1337
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 817.0,
+      "completions/max_terminated_length": 817.0,
+      "completions/mean_length": 434.6160888671875,
+      "completions/mean_terminated_length": 434.6160888671875,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 1.3807067320092856,
+      "grad_norm": 0.8543606996536255,
+      "kl": 0.099609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0059,
+      "num_tokens": 156130151.0,
+      "reward": 1.4160715341567993,
+      "reward_std": 0.21647287905216217,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41607144474983215,
+      "rewards/curriculum_aware_reward_fn/std": 0.3719448447227478,
+      "step": 1338
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 919.0,
+      "completions/max_terminated_length": 919.0,
+      "completions/mean_length": 414.26788330078125,
+      "completions/mean_terminated_length": 414.26788330078125,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 1.3817384575702862,
+      "grad_norm": 0.6648277640342712,
+      "kl": 0.1080322265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0143,
+      "num_tokens": 156242916.0,
+      "reward": 1.3767858743667603,
+      "reward_std": 0.13667474687099457,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3857143223285675,
+      "rewards/curriculum_aware_reward_fn/std": 0.3733479380607605,
+      "step": 1339
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 939.0,
+      "completions/max_terminated_length": 939.0,
+      "completions/mean_length": 477.77679443359375,
+      "completions/mean_terminated_length": 477.77679443359375,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 1.382770183131287,
+      "grad_norm": 0.825429379940033,
+      "kl": 0.1053466796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0274,
+      "num_tokens": 156362318.0,
+      "reward": 1.3816965818405151,
+      "reward_std": 0.21259619295597076,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3816964328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.35336196422576904,
+      "step": 1340
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 840.0,
+      "completions/max_terminated_length": 840.0,
+      "completions/mean_length": 447.26788330078125,
+      "completions/mean_terminated_length": 447.26788330078125,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 1.383801908692288,
+      "grad_norm": 0.601068913936615,
+      "kl": 0.0947265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0024,
+      "num_tokens": 156476644.0,
+      "reward": 1.2200894355773926,
+      "reward_std": 0.0771571695804596,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22008930146694183,
+      "rewards/curriculum_aware_reward_fn/std": 0.2974713146686554,
+      "step": 1341
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1030.0,
+      "completions/max_terminated_length": 1030.0,
+      "completions/mean_length": 388.5357360839844,
+      "completions/mean_terminated_length": 388.5357360839844,
+      "completions/min_length": 150.0,
+      "completions/min_terminated_length": 150.0,
+      "epoch": 1.3848336342532885,
+      "grad_norm": 0.8562108278274536,
+      "kl": 0.10888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0165,
+      "num_tokens": 156588501.0,
+      "reward": 1.4767860174179077,
+      "reward_std": 0.20885738730430603,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47678571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3956366181373596,
+      "step": 1342
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 810.0,
+      "completions/max_terminated_length": 810.0,
+      "completions/mean_length": 389.3125305175781,
+      "completions/mean_terminated_length": 389.3125305175781,
+      "completions/min_length": 137.0,
+      "completions/min_terminated_length": 137.0,
+      "epoch": 1.3858653598142894,
+      "grad_norm": 0.7280513048171997,
+      "kl": 0.1011962890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0069,
+      "num_tokens": 156687411.0,
+      "reward": 1.4861608743667603,
+      "reward_std": 0.10716202110052109,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4861607551574707,
+      "rewards/curriculum_aware_reward_fn/std": 0.35956743359565735,
+      "step": 1343
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 929.0,
+      "completions/max_terminated_length": 929.0,
+      "completions/mean_length": 374.2500305175781,
+      "completions/mean_terminated_length": 374.2500305175781,
+      "completions/min_length": 132.0,
+      "completions/min_terminated_length": 132.0,
+      "epoch": 1.3868970853752902,
+      "grad_norm": 0.6822706460952759,
+      "kl": 0.0941162109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0014,
+      "num_tokens": 156793832.0,
+      "reward": 1.3946430683135986,
+      "reward_std": 0.10583854466676712,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3946428894996643,
+      "rewards/curriculum_aware_reward_fn/std": 0.3630673289299011,
+      "step": 1344
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 864.0,
+      "completions/max_terminated_length": 864.0,
+      "completions/mean_length": 436.58929443359375,
+      "completions/mean_terminated_length": 436.58929443359375,
+      "completions/min_length": 155.0,
+      "completions/min_terminated_length": 155.0,
+      "epoch": 1.387928810936291,
+      "grad_norm": 0.7481700778007507,
+      "kl": 0.0899658203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0189,
+      "num_tokens": 156897414.0,
+      "reward": 1.3848215341567993,
+      "reward_std": 0.13752786815166473,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39375001192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.35780394077301025,
+      "step": 1345
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1382.0,
+      "completions/max_terminated_length": 1382.0,
+      "completions/mean_length": 446.3482360839844,
+      "completions/mean_terminated_length": 446.3482360839844,
+      "completions/min_length": 152.0,
+      "completions/min_terminated_length": 152.0,
+      "epoch": 1.3889605364972917,
+      "grad_norm": 0.6150304675102234,
+      "kl": 0.097412109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0227,
+      "num_tokens": 157013835.0,
+      "reward": 1.3187501430511475,
+      "reward_std": 0.09968876093626022,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3187500536441803,
+      "rewards/curriculum_aware_reward_fn/std": 0.3391912281513214,
+      "step": 1346
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1066.0,
+      "completions/max_terminated_length": 1066.0,
+      "completions/mean_length": 403.6875305175781,
+      "completions/mean_terminated_length": 403.6875305175781,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 1.3899922620582925,
+      "grad_norm": 0.7923887968063354,
+      "kl": 0.1036376953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0086,
+      "num_tokens": 157126682.0,
+      "reward": 1.3732144832611084,
+      "reward_std": 0.19832755625247955,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3821428418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.36698535084724426,
+      "step": 1347
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1755.0,
+      "completions/max_terminated_length": 1755.0,
+      "completions/mean_length": 398.4821472167969,
+      "completions/mean_terminated_length": 398.4821472167969,
+      "completions/min_length": 140.0,
+      "completions/min_terminated_length": 140.0,
+      "epoch": 1.3910239876192931,
+      "grad_norm": 0.6704386472702026,
+      "kl": 0.094482421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0231,
+      "num_tokens": 157235243.0,
+      "reward": 1.4232144355773926,
+      "reward_std": 0.13495533168315887,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42321428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3870033025741577,
+      "step": 1348
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 779.0,
+      "completions/max_terminated_length": 779.0,
+      "completions/mean_length": 407.6250305175781,
+      "completions/mean_terminated_length": 407.6250305175781,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 1.392055713180294,
+      "grad_norm": 0.9023550152778625,
+      "kl": 0.106689453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0333,
+      "num_tokens": 157347544.0,
+      "reward": 1.4281251430511475,
+      "reward_std": 0.22871477901935577,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4370536208152771,
+      "rewards/curriculum_aware_reward_fn/std": 0.3764907717704773,
+      "step": 1349
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 755.0,
+      "completions/max_terminated_length": 755.0,
+      "completions/mean_length": 392.39288330078125,
+      "completions/mean_terminated_length": 392.39288330078125,
+      "completions/min_length": 148.0,
+      "completions/min_terminated_length": 148.0,
+      "epoch": 1.3930874387412948,
+      "grad_norm": 0.9493203163146973,
+      "kl": 0.1072998046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0169,
+      "num_tokens": 157461688.0,
+      "reward": 1.2642858028411865,
+      "reward_std": 0.24542173743247986,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.26428571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.38463911414146423,
+      "step": 1350
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 977.0,
+      "completions/max_terminated_length": 977.0,
+      "completions/mean_length": 435.6696472167969,
+      "completions/mean_terminated_length": 435.6696472167969,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 1.3941191643022957,
+      "grad_norm": 0.7578283548355103,
+      "kl": 0.1094970703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0126,
+      "num_tokens": 157583220.0,
+      "reward": 1.3258929252624512,
+      "reward_std": 0.1972636580467224,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3258928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.34731805324554443,
+      "step": 1351
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 857.0,
+      "completions/max_terminated_length": 857.0,
+      "completions/mean_length": 434.26788330078125,
+      "completions/mean_terminated_length": 434.26788330078125,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 1.3951508898632965,
+      "grad_norm": 0.8210800290107727,
+      "kl": 0.095947265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0344,
+      "num_tokens": 157699639.0,
+      "reward": 1.305803656578064,
+      "reward_std": 0.2395620495080948,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3236607015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.37548527121543884,
+      "step": 1352
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 711.0,
+      "completions/max_terminated_length": 711.0,
+      "completions/mean_length": 416.3750305175781,
+      "completions/mean_terminated_length": 416.3750305175781,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 1.3961826154242971,
+      "grad_norm": 0.9273058772087097,
+      "kl": 0.1324462890625,
+      "learning_rate": 1e-06,
+      "loss": 0.004,
+      "num_tokens": 157818737.0,
+      "reward": 1.391517996788025,
+      "reward_std": 0.1689910590648651,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39151787757873535,
+      "rewards/curriculum_aware_reward_fn/std": 0.373073935508728,
+      "step": 1353
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 701.0,
+      "completions/max_terminated_length": 701.0,
+      "completions/mean_length": 407.8750305175781,
+      "completions/mean_terminated_length": 407.8750305175781,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 1.397214340985298,
+      "grad_norm": 0.7501819729804993,
+      "kl": 0.113525390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0045,
+      "num_tokens": 157928371.0,
+      "reward": 1.3727679252624512,
+      "reward_std": 0.12758342921733856,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3727678954601288,
+      "rewards/curriculum_aware_reward_fn/std": 0.3793005049228668,
+      "step": 1354
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 807.0,
+      "completions/max_terminated_length": 807.0,
+      "completions/mean_length": 402.8571472167969,
+      "completions/mean_terminated_length": 402.8571472167969,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "epoch": 1.3982460665462986,
+      "grad_norm": 0.6688710451126099,
+      "kl": 0.0921630859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0106,
+      "num_tokens": 158044940.0,
+      "reward": 1.4031250476837158,
+      "reward_std": 0.13010619580745697,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40312501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.379976361989975,
+      "step": 1355
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 787.0,
+      "completions/max_terminated_length": 787.0,
+      "completions/mean_length": 431.5446472167969,
+      "completions/mean_terminated_length": 431.5446472167969,
+      "completions/min_length": 140.0,
+      "completions/min_terminated_length": 140.0,
+      "epoch": 1.3992777921072994,
+      "grad_norm": 0.6437855362892151,
+      "kl": 0.0950927734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0007,
+      "num_tokens": 158162041.0,
+      "reward": 1.3906251192092896,
+      "reward_std": 0.09287998080253601,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.390625,
+      "rewards/curriculum_aware_reward_fn/std": 0.343506783246994,
+      "step": 1356
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 647.0,
+      "completions/max_terminated_length": 647.0,
+      "completions/mean_length": 401.14288330078125,
+      "completions/mean_terminated_length": 401.14288330078125,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 1.4003095176683003,
+      "grad_norm": 0.7588181495666504,
+      "kl": 0.101806640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0142,
+      "num_tokens": 158274603.0,
+      "reward": 1.3504464626312256,
+      "reward_std": 0.17711101472377777,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3504464328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.46820029616355896,
+      "step": 1357
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 709.0,
+      "completions/max_terminated_length": 709.0,
+      "completions/mean_length": 370.2857360839844,
+      "completions/mean_terminated_length": 370.2857360839844,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 1.401341243229301,
+      "grad_norm": 0.6538251042366028,
+      "kl": 0.097900390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0165,
+      "num_tokens": 158379432.0,
+      "reward": 1.4915181398391724,
+      "reward_std": 0.13546860218048096,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4915178716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.40757957100868225,
+      "step": 1358
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 996.0,
+      "completions/max_terminated_length": 996.0,
+      "completions/mean_length": 385.46429443359375,
+      "completions/mean_terminated_length": 385.46429443359375,
+      "completions/min_length": 138.0,
+      "completions/min_terminated_length": 138.0,
+      "epoch": 1.4023729687903017,
+      "grad_norm": 0.7940965890884399,
+      "kl": 0.09423828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0055,
+      "num_tokens": 158490742.0,
+      "reward": 1.4316965341567993,
+      "reward_std": 0.18486927449703217,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44062504172325134,
+      "rewards/curriculum_aware_reward_fn/std": 0.34987330436706543,
+      "step": 1359
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 748.0,
+      "completions/max_terminated_length": 748.0,
+      "completions/mean_length": 388.4196472167969,
+      "completions/mean_terminated_length": 388.4196472167969,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 1.4034046943513026,
+      "grad_norm": 0.7965719103813171,
+      "kl": 0.097412109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0106,
+      "num_tokens": 158591554.0,
+      "reward": 1.341071605682373,
+      "reward_std": 0.14536845684051514,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3410714268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.3188672363758087,
+      "step": 1360
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 945.0,
+      "completions/max_terminated_length": 945.0,
+      "completions/mean_length": 412.6607360839844,
+      "completions/mean_terminated_length": 412.6607360839844,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 1.4044364199123034,
+      "grad_norm": 0.7682105302810669,
+      "kl": 0.0946044921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0178,
+      "num_tokens": 158705056.0,
+      "reward": 1.4508929252624512,
+      "reward_std": 0.18029843270778656,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4508928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3541887402534485,
+      "step": 1361
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 866.0,
+      "completions/max_terminated_length": 866.0,
+      "completions/mean_length": 387.6875305175781,
+      "completions/mean_terminated_length": 387.6875305175781,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "epoch": 1.405468145473304,
+      "grad_norm": 0.7575867772102356,
+      "kl": 0.0928955078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0044,
+      "num_tokens": 158816097.0,
+      "reward": 1.4593751430511475,
+      "reward_std": 0.1781318336725235,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4593750536441803,
+      "rewards/curriculum_aware_reward_fn/std": 0.3494223952293396,
+      "step": 1362
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 582.0,
+      "completions/max_terminated_length": 582.0,
+      "completions/mean_length": 353.4732360839844,
+      "completions/mean_terminated_length": 353.4732360839844,
+      "completions/min_length": 136.0,
+      "completions/min_terminated_length": 136.0,
+      "epoch": 1.4064998710343048,
+      "grad_norm": 0.8406264185905457,
+      "kl": 0.107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0123,
+      "num_tokens": 158921983.0,
+      "reward": 1.5031250715255737,
+      "reward_std": 0.1740131974220276,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.503125011920929,
+      "rewards/curriculum_aware_reward_fn/std": 0.3575619161128998,
+      "step": 1363
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 851.0,
+      "completions/max_terminated_length": 851.0,
+      "completions/mean_length": 404.2857360839844,
+      "completions/mean_terminated_length": 404.2857360839844,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 1.4075315965953057,
+      "grad_norm": 0.8518685102462769,
+      "kl": 0.095947265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0241,
+      "num_tokens": 159041176.0,
+      "reward": 1.344642996788025,
+      "reward_std": 0.21171006560325623,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34464284777641296,
+      "rewards/curriculum_aware_reward_fn/std": 0.3280681073665619,
+      "step": 1364
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1481.0,
+      "completions/max_terminated_length": 1481.0,
+      "completions/mean_length": 394.3125305175781,
+      "completions/mean_terminated_length": 394.3125305175781,
+      "completions/min_length": 158.0,
+      "completions/min_terminated_length": 158.0,
+      "epoch": 1.4085633221563065,
+      "grad_norm": 0.7855452299118042,
+      "kl": 0.0970458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0064,
+      "num_tokens": 159145959.0,
+      "reward": 1.4522322416305542,
+      "reward_std": 0.13949735462665558,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45223215222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.3876696527004242,
+      "step": 1365
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 652.0,
+      "completions/max_terminated_length": 652.0,
+      "completions/mean_length": 366.3125305175781,
+      "completions/mean_terminated_length": 366.3125305175781,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "epoch": 1.4095950477173071,
+      "grad_norm": 0.9100239872932434,
+      "kl": 0.104736328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0092,
+      "num_tokens": 159255796.0,
+      "reward": 1.4200893640518188,
+      "reward_std": 0.18777905404567719,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4200893044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.34520265460014343,
+      "step": 1366
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 822.0,
+      "completions/max_terminated_length": 822.0,
+      "completions/mean_length": 377.65179443359375,
+      "completions/mean_terminated_length": 377.65179443359375,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 1.410626773278308,
+      "grad_norm": 0.708103597164154,
+      "kl": 0.0955810546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0245,
+      "num_tokens": 159361478.0,
+      "reward": 1.5017858743667603,
+      "reward_std": 0.17366045713424683,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5017856955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.3746426999568939,
+      "step": 1367
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1215.0,
+      "completions/max_terminated_length": 1215.0,
+      "completions/mean_length": 455.45538330078125,
+      "completions/mean_terminated_length": 455.45538330078125,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 1.4116584988393086,
+      "grad_norm": 0.5828267931938171,
+      "kl": 0.0880126953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0,
+      "num_tokens": 159481446.0,
+      "reward": 1.2611607313156128,
+      "reward_std": 0.0732208862900734,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2611607313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.3630892336368561,
+      "step": 1368
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 892.0,
+      "completions/max_terminated_length": 892.0,
+      "completions/mean_length": 419.1875305175781,
+      "completions/mean_terminated_length": 419.1875305175781,
+      "completions/min_length": 140.0,
+      "completions/min_terminated_length": 140.0,
+      "epoch": 1.4126902244003094,
+      "grad_norm": 0.7837874889373779,
+      "kl": 0.089599609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0019,
+      "num_tokens": 159592760.0,
+      "reward": 1.344642996788025,
+      "reward_std": 0.20070511102676392,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34464284777641296,
+      "rewards/curriculum_aware_reward_fn/std": 0.3375425934791565,
+      "step": 1369
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 874.0,
+      "completions/max_terminated_length": 874.0,
+      "completions/mean_length": 392.95538330078125,
+      "completions/mean_terminated_length": 392.95538330078125,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 1.4137219499613103,
+      "grad_norm": 0.798795759677887,
+      "kl": 0.0869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0139,
+      "num_tokens": 159702879.0,
+      "reward": 1.3674108982086182,
+      "reward_std": 0.1692216545343399,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36741071939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3445005714893341,
+      "step": 1370
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 894.0,
+      "completions/max_terminated_length": 894.0,
+      "completions/mean_length": 386.5714416503906,
+      "completions/mean_terminated_length": 386.5714416503906,
+      "completions/min_length": 123.0,
+      "completions/min_terminated_length": 123.0,
+      "epoch": 1.4147536755223111,
+      "grad_norm": 0.8209942579269409,
+      "kl": 0.1168212890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0052,
+      "num_tokens": 159817316.0,
+      "reward": 1.5406252145767212,
+      "reward_std": 0.20728908479213715,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5406250357627869,
+      "rewards/curriculum_aware_reward_fn/std": 0.35523951053619385,
+      "step": 1371
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1228.0,
+      "completions/max_terminated_length": 1228.0,
+      "completions/mean_length": 438.1160888671875,
+      "completions/mean_terminated_length": 438.1160888671875,
+      "completions/min_length": 150.0,
+      "completions/min_terminated_length": 150.0,
+      "epoch": 1.415785401083312,
+      "grad_norm": 0.7205373048782349,
+      "kl": 0.0789794921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0351,
+      "num_tokens": 159934901.0,
+      "reward": 1.4044643640518188,
+      "reward_std": 0.14802031219005585,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4044643044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.3880857229232788,
+      "step": 1372
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 817.0,
+      "completions/max_terminated_length": 817.0,
+      "completions/mean_length": 469.4285888671875,
+      "completions/mean_terminated_length": 469.4285888671875,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 1.4168171266443126,
+      "grad_norm": 0.6454539895057678,
+      "kl": 0.0826416015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0037,
+      "num_tokens": 160056315.0,
+      "reward": 1.274553656578064,
+      "reward_std": 0.1375679075717926,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2745535969734192,
+      "rewards/curriculum_aware_reward_fn/std": 0.3422892987728119,
+      "step": 1373
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 696.0,
+      "completions/max_terminated_length": 696.0,
+      "completions/mean_length": 410.90179443359375,
+      "completions/mean_terminated_length": 410.90179443359375,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 1.4178488522053134,
+      "grad_norm": 0.717077910900116,
+      "kl": 0.09716796875,
+      "learning_rate": 1e-06,
+      "loss": -0.014,
+      "num_tokens": 160174940.0,
+      "reward": 1.2781251668930054,
+      "reward_std": 0.1405002921819687,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.27812501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3249934911727905,
+      "step": 1374
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 673.0,
+      "completions/max_terminated_length": 673.0,
+      "completions/mean_length": 376.40179443359375,
+      "completions/mean_terminated_length": 376.40179443359375,
+      "completions/min_length": 128.0,
+      "completions/min_terminated_length": 128.0,
+      "epoch": 1.418880577766314,
+      "grad_norm": 0.691324770450592,
+      "kl": 0.0902099609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0082,
+      "num_tokens": 160274924.0,
+      "reward": 1.5562502145767212,
+      "reward_std": 0.1841638833284378,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5562499761581421,
+      "rewards/curriculum_aware_reward_fn/std": 0.3889883756637573,
+      "step": 1375
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1306.0,
+      "completions/max_terminated_length": 1306.0,
+      "completions/mean_length": 423.4464416503906,
+      "completions/mean_terminated_length": 423.4464416503906,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 1.4199123033273149,
+      "grad_norm": 0.7552855014801025,
+      "kl": 0.091064453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0103,
+      "num_tokens": 160389459.0,
+      "reward": 1.520535945892334,
+      "reward_std": 0.20544345676898956,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5294643044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.42688655853271484,
+      "step": 1376
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 991.0,
+      "completions/max_terminated_length": 991.0,
+      "completions/mean_length": 408.2589416503906,
+      "completions/mean_terminated_length": 408.2589416503906,
+      "completions/min_length": 107.0,
+      "completions/min_terminated_length": 107.0,
+      "epoch": 1.4209440288883157,
+      "grad_norm": 0.7593014240264893,
+      "kl": 0.1038818359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0007,
+      "num_tokens": 160513997.0,
+      "reward": 1.4910715818405151,
+      "reward_std": 0.18334133923053741,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5,
+      "rewards/curriculum_aware_reward_fn/std": 0.3415650427341461,
+      "step": 1377
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 988.0,
+      "completions/max_terminated_length": 988.0,
+      "completions/mean_length": 415.4732360839844,
+      "completions/mean_terminated_length": 415.4732360839844,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "epoch": 1.4219757544493166,
+      "grad_norm": 0.7567901611328125,
+      "kl": 0.09130859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0109,
+      "num_tokens": 160629711.0,
+      "reward": 1.3964285850524902,
+      "reward_std": 0.17067018151283264,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39642858505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.38471439480781555,
+      "step": 1378
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1413.0,
+      "completions/max_terminated_length": 1413.0,
+      "completions/mean_length": 431.8660888671875,
+      "completions/mean_terminated_length": 431.8660888671875,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 1.4230074800103172,
+      "grad_norm": 0.7414162755012512,
+      "kl": 0.0986328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0022,
+      "num_tokens": 160742557.0,
+      "reward": 1.4924108982086182,
+      "reward_std": 0.20008407533168793,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.501339316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.3692249059677124,
+      "step": 1379
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1042.0,
+      "completions/max_terminated_length": 1042.0,
+      "completions/mean_length": 417.33929443359375,
+      "completions/mean_terminated_length": 417.33929443359375,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 1.424039205571318,
+      "grad_norm": 0.6366227269172668,
+      "kl": 0.1085205078125,
+      "learning_rate": 1e-06,
+      "loss": -0.002,
+      "num_tokens": 160861168.0,
+      "reward": 1.3656251430511475,
+      "reward_std": 0.1291547268629074,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3656250536441803,
+      "rewards/curriculum_aware_reward_fn/std": 0.37990227341651917,
+      "step": 1380
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1041.0,
+      "completions/max_terminated_length": 1041.0,
+      "completions/mean_length": 429.7589416503906,
+      "completions/mean_terminated_length": 429.7589416503906,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 1.4250709311323189,
+      "grad_norm": 0.6410421133041382,
+      "kl": 0.08984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0126,
+      "num_tokens": 160983718.0,
+      "reward": 1.6232143640518188,
+      "reward_std": 0.15141195058822632,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6232143044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.44402584433555603,
+      "step": 1381
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 969.0,
+      "completions/max_terminated_length": 969.0,
+      "completions/mean_length": 452.4375305175781,
+      "completions/mean_terminated_length": 452.4375305175781,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 1.4261026566933195,
+      "grad_norm": 0.7187079191207886,
+      "kl": 0.095703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0019,
+      "num_tokens": 161103110.0,
+      "reward": 1.3169643878936768,
+      "reward_std": 0.1875334084033966,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3169642984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.3558743894100189,
+      "step": 1382
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1380.0,
+      "completions/max_terminated_length": 1380.0,
+      "completions/mean_length": 431.7857360839844,
+      "completions/mean_terminated_length": 431.7857360839844,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 1.4271343822543203,
+      "grad_norm": 0.7007352113723755,
+      "kl": 0.0953369140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0002,
+      "num_tokens": 161225565.0,
+      "reward": 1.3937500715255737,
+      "reward_std": 0.17483645677566528,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39374998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3934214115142822,
+      "step": 1383
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1214.0,
+      "completions/max_terminated_length": 1214.0,
+      "completions/mean_length": 397.39288330078125,
+      "completions/mean_terminated_length": 397.39288330078125,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 1.4281661078153212,
+      "grad_norm": 0.7081319093704224,
+      "kl": 0.1026611328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0092,
+      "num_tokens": 161332072.0,
+      "reward": 1.4406250715255737,
+      "reward_std": 0.1809663474559784,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44062501192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.36702674627304077,
+      "step": 1384
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1568.0,
+      "completions/max_terminated_length": 1568.0,
+      "completions/mean_length": 461.1785888671875,
+      "completions/mean_terminated_length": 461.1785888671875,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 1.429197833376322,
+      "grad_norm": 0.6438872218132019,
+      "kl": 0.089599609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0123,
+      "num_tokens": 161447200.0,
+      "reward": 1.4156250953674316,
+      "reward_std": 0.21118290722370148,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41562503576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.3949587643146515,
+      "step": 1385
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1009.0,
+      "completions/max_terminated_length": 1009.0,
+      "completions/mean_length": 438.0000305175781,
+      "completions/mean_terminated_length": 438.0000305175781,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 1.4302295589373226,
+      "grad_norm": 0.6846404671669006,
+      "kl": 0.092529296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0005,
+      "num_tokens": 161560831.0,
+      "reward": 1.426785945892334,
+      "reward_std": 0.23958255350589752,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45357146859169006,
+      "rewards/curriculum_aware_reward_fn/std": 0.3801797926425934,
+      "step": 1386
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1300.0,
+      "completions/max_terminated_length": 1300.0,
+      "completions/mean_length": 520.7142944335938,
+      "completions/mean_terminated_length": 520.7142944335938,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 1.4312612844983235,
+      "grad_norm": 0.7028166055679321,
+      "kl": 0.0889892578125,
+      "learning_rate": 1e-06,
+      "loss": -0.01,
+      "num_tokens": 161683208.0,
+      "reward": 1.3093751668930054,
+      "reward_std": 0.16449230909347534,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33616071939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.35235437750816345,
+      "step": 1387
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 830.0,
+      "completions/max_terminated_length": 830.0,
+      "completions/mean_length": 384.2232360839844,
+      "completions/mean_terminated_length": 384.2232360839844,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 1.4322930100593243,
+      "grad_norm": 0.6343652009963989,
+      "kl": 0.0977783203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0042,
+      "num_tokens": 161788184.0,
+      "reward": 1.3741072416305542,
+      "reward_std": 0.13007023930549622,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37410715222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.40089890360832214,
+      "step": 1388
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 891.0,
+      "completions/max_terminated_length": 891.0,
+      "completions/mean_length": 403.0000305175781,
+      "completions/mean_terminated_length": 403.0000305175781,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 1.433324735620325,
+      "grad_norm": 0.7477560043334961,
+      "kl": 0.0970458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0181,
+      "num_tokens": 161900632.0,
+      "reward": 1.3531250953674316,
+      "reward_std": 0.20165258646011353,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36205360293388367,
+      "rewards/curriculum_aware_reward_fn/std": 0.37807273864746094,
+      "step": 1389
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 891.0,
+      "completions/max_terminated_length": 891.0,
+      "completions/mean_length": 436.08929443359375,
+      "completions/mean_terminated_length": 436.08929443359375,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 1.4343564611813258,
+      "grad_norm": 0.6067072749137878,
+      "kl": 0.0994873046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0227,
+      "num_tokens": 162016317.0,
+      "reward": 1.2183036804199219,
+      "reward_std": 0.20316217839717865,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22723214328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3544370234012604,
+      "step": 1390
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3818.0,
+      "completions/max_terminated_length": 3818.0,
+      "completions/mean_length": 489.7500305175781,
+      "completions/mean_terminated_length": 489.7500305175781,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 1.4353881867423266,
+      "grad_norm": 0.679914116859436,
+      "kl": 0.0909423828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0185,
+      "num_tokens": 162136227.0,
+      "reward": 1.3629463911056519,
+      "reward_std": 0.2041923701763153,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36294645071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.3852427303791046,
+      "step": 1391
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 948.0,
+      "completions/max_terminated_length": 948.0,
+      "completions/mean_length": 435.76788330078125,
+      "completions/mean_terminated_length": 435.76788330078125,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 1.4364199123033274,
+      "grad_norm": 0.7845681309700012,
+      "kl": 0.09423828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0101,
+      "num_tokens": 162252772.0,
+      "reward": 1.2741073369979858,
+      "reward_std": 0.20398284494876862,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2830357253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.3526957631111145,
+      "step": 1392
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 938.0,
+      "completions/max_terminated_length": 938.0,
+      "completions/mean_length": 471.3125305175781,
+      "completions/mean_terminated_length": 471.3125305175781,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 1.437451637864328,
+      "grad_norm": 0.6001479029655457,
+      "kl": 0.086669921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0066,
+      "num_tokens": 162384463.0,
+      "reward": 1.3040179014205933,
+      "reward_std": 0.153373122215271,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3040178418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.3661424517631531,
+      "step": 1393
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 792.0,
+      "completions/max_terminated_length": 792.0,
+      "completions/mean_length": 421.71429443359375,
+      "completions/mean_terminated_length": 421.71429443359375,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 1.438483363425329,
+      "grad_norm": 0.6689612865447998,
+      "kl": 0.10498046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0016,
+      "num_tokens": 162498069.0,
+      "reward": 1.424553632736206,
+      "reward_std": 0.1970020830631256,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42455360293388367,
+      "rewards/curriculum_aware_reward_fn/std": 0.402972936630249,
+      "step": 1394
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1096.0,
+      "completions/max_terminated_length": 1096.0,
+      "completions/mean_length": 452.1250305175781,
+      "completions/mean_terminated_length": 452.1250305175781,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 1.4395150889863295,
+      "grad_norm": 0.7905884385108948,
+      "kl": 0.09716796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0027,
+      "num_tokens": 162618476.0,
+      "reward": 1.2879464626312256,
+      "reward_std": 0.248353973031044,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.296875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3422423005104065,
+      "step": 1395
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1315.0,
+      "completions/max_terminated_length": 1315.0,
+      "completions/mean_length": 457.4107360839844,
+      "completions/mean_terminated_length": 457.4107360839844,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 1.4405468145473304,
+      "grad_norm": 0.7530561685562134,
+      "kl": 0.099853515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0038,
+      "num_tokens": 162738894.0,
+      "reward": 1.3598215579986572,
+      "reward_std": 0.1974332481622696,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35982146859169006,
+      "rewards/curriculum_aware_reward_fn/std": 0.3718010187149048,
+      "step": 1396
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1701.0,
+      "completions/max_terminated_length": 1701.0,
+      "completions/mean_length": 487.3482360839844,
+      "completions/mean_terminated_length": 487.3482360839844,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 1.4415785401083312,
+      "grad_norm": 0.6047545671463013,
+      "kl": 0.085205078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0158,
+      "num_tokens": 162868454.0,
+      "reward": 1.2325893640518188,
+      "reward_std": 0.15268686413764954,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24151787161827087,
+      "rewards/curriculum_aware_reward_fn/std": 0.33280736207962036,
+      "step": 1397
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 745.0,
+      "completions/max_terminated_length": 745.0,
+      "completions/mean_length": 429.1160888671875,
+      "completions/mean_terminated_length": 429.1160888671875,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 1.442610265669332,
+      "grad_norm": 0.5883904099464417,
+      "kl": 0.0916748046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0152,
+      "num_tokens": 162982602.0,
+      "reward": 1.4254467487335205,
+      "reward_std": 0.14535757899284363,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42544645071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.381674587726593,
+      "step": 1398
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1189.0,
+      "completions/max_terminated_length": 1189.0,
+      "completions/mean_length": 412.71429443359375,
+      "completions/mean_terminated_length": 412.71429443359375,
+      "completions/min_length": 136.0,
+      "completions/min_terminated_length": 136.0,
+      "epoch": 1.4436419912303327,
+      "grad_norm": 0.6786606907844543,
+      "kl": 0.098876953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0324,
+      "num_tokens": 163090993.0,
+      "reward": 1.4593751430511475,
+      "reward_std": 0.16988961398601532,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4683035910129547,
+      "rewards/curriculum_aware_reward_fn/std": 0.37626636028289795,
+      "step": 1399
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 771.0,
+      "completions/max_terminated_length": 771.0,
+      "completions/mean_length": 406.5446472167969,
+      "completions/mean_terminated_length": 406.5446472167969,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 1.4446737167913335,
+      "grad_norm": 0.6487584710121155,
+      "kl": 0.0985107421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0094,
+      "num_tokens": 163202774.0,
+      "reward": 1.5339287519454956,
+      "reward_std": 0.1595512330532074,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5339285731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.36305850744247437,
+      "step": 1400
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1228.0,
+      "completions/max_terminated_length": 1228.0,
+      "completions/mean_length": 429.4196472167969,
+      "completions/mean_terminated_length": 429.4196472167969,
+      "completions/min_length": 134.0,
+      "completions/min_terminated_length": 134.0,
+      "epoch": 1.4457054423523343,
+      "grad_norm": 0.6628140807151794,
+      "kl": 0.0904541015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0303,
+      "num_tokens": 163320715.0,
+      "reward": 1.5446429252624512,
+      "reward_std": 0.20707891881465912,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5535714626312256,
+      "rewards/curriculum_aware_reward_fn/std": 0.3792307674884796,
+      "step": 1401
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1015.0,
+      "completions/max_terminated_length": 1015.0,
+      "completions/mean_length": 475.26788330078125,
+      "completions/mean_terminated_length": 475.26788330078125,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 1.446737167913335,
+      "grad_norm": 0.7178117036819458,
+      "kl": 0.0933837890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0262,
+      "num_tokens": 163448845.0,
+      "reward": 1.4919644594192505,
+      "reward_std": 0.18081454932689667,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5008928775787354,
+      "rewards/curriculum_aware_reward_fn/std": 0.47267603874206543,
+      "step": 1402
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1358.0,
+      "completions/max_terminated_length": 1358.0,
+      "completions/mean_length": 540.919677734375,
+      "completions/mean_terminated_length": 540.919677734375,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 1.4477688934743358,
+      "grad_norm": 0.6339001655578613,
+      "kl": 0.080322265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0023,
+      "num_tokens": 163579764.0,
+      "reward": 1.2714287042617798,
+      "reward_std": 0.18801215291023254,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.27142858505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.32959944009780884,
+      "step": 1403
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 945.0,
+      "completions/max_terminated_length": 945.0,
+      "completions/mean_length": 511.232177734375,
+      "completions/mean_terminated_length": 511.232177734375,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 1.4488006190353366,
+      "grad_norm": 0.7088522911071777,
+      "kl": 0.1148681640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0079,
+      "num_tokens": 163706686.0,
+      "reward": 1.3120536804199219,
+      "reward_std": 0.15960483253002167,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3209821879863739,
+      "rewards/curriculum_aware_reward_fn/std": 0.3356879651546478,
+      "step": 1404
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 960.0,
+      "completions/max_terminated_length": 960.0,
+      "completions/mean_length": 489.6785888671875,
+      "completions/mean_terminated_length": 489.6785888671875,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 1.4498323445963375,
+      "grad_norm": 0.7049396634101868,
+      "kl": 0.08203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0121,
+      "num_tokens": 163831845.0,
+      "reward": 1.4357143640518188,
+      "reward_std": 0.17000097036361694,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4357143044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.3761126697063446,
+      "step": 1405
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1513.0,
+      "completions/max_terminated_length": 1513.0,
+      "completions/mean_length": 482.6785888671875,
+      "completions/mean_terminated_length": 482.6785888671875,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 1.450864070157338,
+      "grad_norm": 0.6695181131362915,
+      "kl": 0.0791015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0071,
+      "num_tokens": 163948163.0,
+      "reward": 1.3866074085235596,
+      "reward_std": 0.19195009768009186,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38660717010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.35080230236053467,
+      "step": 1406
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 959.0,
+      "completions/max_terminated_length": 959.0,
+      "completions/mean_length": 460.14288330078125,
+      "completions/mean_terminated_length": 460.14288330078125,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 1.451895795718339,
+      "grad_norm": 0.5940765738487244,
+      "kl": 0.098876953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0042,
+      "num_tokens": 164069804.0,
+      "reward": 1.3875001668930054,
+      "reward_std": 0.15338915586471558,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38750001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.41359224915504456,
+      "step": 1407
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1256.0,
+      "completions/max_terminated_length": 1256.0,
+      "completions/mean_length": 469.4285888671875,
+      "completions/mean_terminated_length": 469.4285888671875,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "epoch": 1.4529275212793398,
+      "grad_norm": 0.681818425655365,
+      "kl": 0.0908203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0186,
+      "num_tokens": 164202390.0,
+      "reward": 1.4303573369979858,
+      "reward_std": 0.13860322535037994,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4303571581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.3885962963104248,
+      "step": 1408
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1762.0,
+      "completions/max_terminated_length": 1762.0,
+      "completions/mean_length": 476.3125305175781,
+      "completions/mean_terminated_length": 476.3125305175781,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 1.4539592468403404,
+      "grad_norm": 0.6444785594940186,
+      "kl": 0.089599609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0081,
+      "num_tokens": 164331365.0,
+      "reward": 1.282589316368103,
+      "reward_std": 0.15449091792106628,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28258928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.36091160774230957,
+      "step": 1409
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1121.0,
+      "completions/max_terminated_length": 1121.0,
+      "completions/mean_length": 510.232177734375,
+      "completions/mean_terminated_length": 510.232177734375,
+      "completions/min_length": 263.0,
+      "completions/min_terminated_length": 263.0,
+      "epoch": 1.4549909724013412,
+      "grad_norm": 0.791489839553833,
+      "kl": 0.0924072265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0009,
+      "num_tokens": 164450006.0,
+      "reward": 1.372321605682373,
+      "reward_std": 0.25465095043182373,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3812499940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.3428894877433777,
+      "step": 1410
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1113.0,
+      "completions/max_terminated_length": 1113.0,
+      "completions/mean_length": 483.7500305175781,
+      "completions/mean_terminated_length": 483.7500305175781,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "epoch": 1.456022697962342,
+      "grad_norm": 0.6734673380851746,
+      "kl": 0.091064453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0032,
+      "num_tokens": 164572702.0,
+      "reward": 1.3053573369979858,
+      "reward_std": 0.18237224221229553,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3142857253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.37195783853530884,
+      "step": 1411
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1091.0,
+      "completions/max_terminated_length": 1091.0,
+      "completions/mean_length": 478.58038330078125,
+      "completions/mean_terminated_length": 478.58038330078125,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 1.457054423523343,
+      "grad_norm": 0.6534205079078674,
+      "kl": 0.0865478515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0181,
+      "num_tokens": 164696282.0,
+      "reward": 1.3339285850524902,
+      "reward_std": 0.18611732125282288,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33392858505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.3818644881248474,
+      "step": 1412
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1331.0,
+      "completions/max_terminated_length": 1331.0,
+      "completions/mean_length": 470.33929443359375,
+      "completions/mean_terminated_length": 470.33929443359375,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 1.4580861490843435,
+      "grad_norm": 0.5302649736404419,
+      "kl": 0.097412109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0051,
+      "num_tokens": 164828620.0,
+      "reward": 1.3053573369979858,
+      "reward_std": 0.11551646143198013,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3053571581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.353448748588562,
+      "step": 1413
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1141.0,
+      "completions/max_terminated_length": 1141.0,
+      "completions/mean_length": 500.6964416503906,
+      "completions/mean_terminated_length": 500.6964416503906,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "epoch": 1.4591178746453444,
+      "grad_norm": 0.7022354006767273,
+      "kl": 0.0821533203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0102,
+      "num_tokens": 164960242.0,
+      "reward": 1.3616071939468384,
+      "reward_std": 0.2165532261133194,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3705357015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.30473393201828003,
+      "step": 1414
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 958.0,
+      "completions/max_terminated_length": 958.0,
+      "completions/mean_length": 404.1250305175781,
+      "completions/mean_terminated_length": 404.1250305175781,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "epoch": 1.460149600206345,
+      "grad_norm": 0.6341533660888672,
+      "kl": 0.0887451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0222,
+      "num_tokens": 165070009.0,
+      "reward": 1.4808037281036377,
+      "reward_std": 0.12062705308198929,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48080354928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.3547137975692749,
+      "step": 1415
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1158.0,
+      "completions/max_terminated_length": 1158.0,
+      "completions/mean_length": 447.89288330078125,
+      "completions/mean_terminated_length": 447.89288330078125,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 1.4611813257673458,
+      "grad_norm": 0.7164600491523743,
+      "kl": 0.0855712890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0359,
+      "num_tokens": 165180520.0,
+      "reward": 1.4535716772079468,
+      "reward_std": 0.21277686953544617,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45357146859169006,
+      "rewards/curriculum_aware_reward_fn/std": 0.3822476267814636,
+      "step": 1416
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 924.0,
+      "completions/max_terminated_length": 924.0,
+      "completions/mean_length": 476.9285888671875,
+      "completions/mean_terminated_length": 476.9285888671875,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 1.4622130513283467,
+      "grad_norm": 0.6117580533027649,
+      "kl": 0.088623046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0184,
+      "num_tokens": 165299372.0,
+      "reward": 1.5316965579986572,
+      "reward_std": 0.20724934339523315,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5316964387893677,
+      "rewards/curriculum_aware_reward_fn/std": 0.481925368309021,
+      "step": 1417
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1294.0,
+      "completions/mean_length": 559.8392944335938,
+      "completions/mean_terminated_length": 527.9819946289062,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 1.4632447768893475,
+      "grad_norm": 0.6708288192749023,
+      "kl": 0.07763671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0337,
+      "num_tokens": 165434265.0,
+      "reward": 1.3084824085235596,
+      "reward_std": 0.2409285604953766,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31741073727607727,
+      "rewards/curriculum_aware_reward_fn/std": 0.3392300605773926,
+      "step": 1418
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1047.0,
+      "completions/max_terminated_length": 1047.0,
+      "completions/mean_length": 420.4464416503906,
+      "completions/mean_terminated_length": 420.4464416503906,
+      "completions/min_length": 158.0,
+      "completions/min_terminated_length": 158.0,
+      "epoch": 1.4642765024503481,
+      "grad_norm": 0.6302593350410461,
+      "kl": 0.093505859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0064,
+      "num_tokens": 165555926.0,
+      "reward": 1.479017972946167,
+      "reward_std": 0.13766011595726013,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47901788353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.3736018240451813,
+      "step": 1419
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1119.0,
+      "completions/max_terminated_length": 1119.0,
+      "completions/mean_length": 491.419677734375,
+      "completions/mean_terminated_length": 491.419677734375,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "epoch": 1.465308228011349,
+      "grad_norm": 0.7415760159492493,
+      "kl": 0.10595703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0227,
+      "num_tokens": 165684052.0,
+      "reward": 1.3625000715255737,
+      "reward_std": 0.19681335985660553,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36250001192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.37395501136779785,
+      "step": 1420
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 870.0,
+      "completions/max_terminated_length": 870.0,
+      "completions/mean_length": 429.0446472167969,
+      "completions/mean_terminated_length": 429.0446472167969,
+      "completions/min_length": 99.0,
+      "completions/min_terminated_length": 99.0,
+      "epoch": 1.4663399535723498,
+      "grad_norm": 0.8263067603111267,
+      "kl": 0.1051025390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0181,
+      "num_tokens": 165805031.0,
+      "reward": 1.393303632736206,
+      "reward_std": 0.20443706214427948,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39330360293388367,
+      "rewards/curriculum_aware_reward_fn/std": 0.3297211229801178,
+      "step": 1421
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1190.0,
+      "completions/max_terminated_length": 1190.0,
+      "completions/mean_length": 465.71429443359375,
+      "completions/mean_terminated_length": 465.71429443359375,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 1.4673716791333504,
+      "grad_norm": 0.7716458439826965,
+      "kl": 0.09326171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0038,
+      "num_tokens": 165916145.0,
+      "reward": 1.3147321939468384,
+      "reward_std": 0.222005695104599,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3147321343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.34021976590156555,
+      "step": 1422
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1123.0,
+      "completions/max_terminated_length": 1123.0,
+      "completions/mean_length": 482.8125305175781,
+      "completions/mean_terminated_length": 482.8125305175781,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 1.4684034046943513,
+      "grad_norm": 0.7555353045463562,
+      "kl": 0.120849609375,
+      "learning_rate": 1e-06,
+      "loss": 0.043,
+      "num_tokens": 166035684.0,
+      "reward": 1.2803572416305542,
+      "reward_std": 0.16637490689754486,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28035715222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.34979772567749023,
+      "step": 1423
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1108.0,
+      "completions/max_terminated_length": 1108.0,
+      "completions/mean_length": 457.77679443359375,
+      "completions/mean_terminated_length": 457.77679443359375,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 1.469435130255352,
+      "grad_norm": 0.6749049425125122,
+      "kl": 0.080810546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0325,
+      "num_tokens": 166155016.0,
+      "reward": 1.395982265472412,
+      "reward_std": 0.1725309193134308,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39598217606544495,
+      "rewards/curriculum_aware_reward_fn/std": 0.36447781324386597,
+      "step": 1424
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 674.0,
+      "completions/max_terminated_length": 674.0,
+      "completions/mean_length": 401.27679443359375,
+      "completions/mean_terminated_length": 401.27679443359375,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 1.470466855816353,
+      "grad_norm": 0.7543728351593018,
+      "kl": 0.0909423828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0089,
+      "num_tokens": 166257408.0,
+      "reward": 1.5339287519454956,
+      "reward_std": 0.2014743983745575,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5428571105003357,
+      "rewards/curriculum_aware_reward_fn/std": 0.36630964279174805,
+      "step": 1425
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 978.0,
+      "completions/max_terminated_length": 978.0,
+      "completions/mean_length": 441.357177734375,
+      "completions/mean_terminated_length": 441.357177734375,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 1.4714985813773536,
+      "grad_norm": 0.7767645120620728,
+      "kl": 0.1064453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0038,
+      "num_tokens": 166378646.0,
+      "reward": 1.4120537042617798,
+      "reward_std": 0.21909524500370026,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41205358505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.3727935552597046,
+      "step": 1426
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 833.0,
+      "completions/max_terminated_length": 833.0,
+      "completions/mean_length": 426.33929443359375,
+      "completions/mean_terminated_length": 426.33929443359375,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 1.4725303069383544,
+      "grad_norm": 0.5623325109481812,
+      "kl": 0.0906982421875,
+      "learning_rate": 1e-06,
+      "loss": 0.023,
+      "num_tokens": 166488206.0,
+      "reward": 1.4102680683135986,
+      "reward_std": 0.12621724605560303,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4102678596973419,
+      "rewards/curriculum_aware_reward_fn/std": 0.3717585802078247,
+      "step": 1427
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1199.0,
+      "completions/mean_length": 536.044677734375,
+      "completions/mean_terminated_length": 503.9729919433594,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "epoch": 1.4735620324993552,
+      "grad_norm": 0.5052303671836853,
+      "kl": 0.076171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0115,
+      "num_tokens": 166611845.0,
+      "reward": 1.4236608743667603,
+      "reward_std": 0.14933934807777405,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4325893223285675,
+      "rewards/curriculum_aware_reward_fn/std": 0.37994247674942017,
+      "step": 1428
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 853.0,
+      "completions/max_terminated_length": 853.0,
+      "completions/mean_length": 410.89288330078125,
+      "completions/mean_terminated_length": 410.89288330078125,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 1.4745937580603559,
+      "grad_norm": 0.6693888306617737,
+      "kl": 0.095703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0297,
+      "num_tokens": 166724099.0,
+      "reward": 1.462053656578064,
+      "reward_std": 0.18026289343833923,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4620535671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.36590293049812317,
+      "step": 1429
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 905.0,
+      "completions/max_terminated_length": 905.0,
+      "completions/mean_length": 465.9107360839844,
+      "completions/mean_terminated_length": 465.9107360839844,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "epoch": 1.4756254836213567,
+      "grad_norm": 0.598745584487915,
+      "kl": 0.0985107421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0226,
+      "num_tokens": 166836033.0,
+      "reward": 1.3633930683135986,
+      "reward_std": 0.129484161734581,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3633928894996643,
+      "rewards/curriculum_aware_reward_fn/std": 0.3225030303001404,
+      "step": 1430
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1022.0,
+      "completions/max_terminated_length": 1022.0,
+      "completions/mean_length": 403.76788330078125,
+      "completions/mean_terminated_length": 403.76788330078125,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 1.4766572091823575,
+      "grad_norm": 0.864762544631958,
+      "kl": 0.10986328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0353,
+      "num_tokens": 166952648.0,
+      "reward": 1.456696629524231,
+      "reward_std": 0.20940305292606354,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45669645071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.37026259303092957,
+      "step": 1431
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1019.0,
+      "completions/max_terminated_length": 1019.0,
+      "completions/mean_length": 422.0714416503906,
+      "completions/mean_terminated_length": 422.0714416503906,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 1.4776889347433584,
+      "grad_norm": 0.7573868036270142,
+      "kl": 0.092529296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0251,
+      "num_tokens": 167074190.0,
+      "reward": 1.4294644594192505,
+      "reward_std": 0.20020684599876404,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43839287757873535,
+      "rewards/curriculum_aware_reward_fn/std": 0.37476611137390137,
+      "step": 1432
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 944.0,
+      "completions/max_terminated_length": 944.0,
+      "completions/mean_length": 469.2500305175781,
+      "completions/mean_terminated_length": 469.2500305175781,
+      "completions/min_length": 148.0,
+      "completions/min_terminated_length": 148.0,
+      "epoch": 1.478720660304359,
+      "grad_norm": 0.7239991426467896,
+      "kl": 0.0771484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0155,
+      "num_tokens": 167188346.0,
+      "reward": 1.3848215341567993,
+      "reward_std": 0.1801411509513855,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39374998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3647240698337555,
+      "step": 1433
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 947.0,
+      "completions/max_terminated_length": 947.0,
+      "completions/mean_length": 429.77679443359375,
+      "completions/mean_terminated_length": 429.77679443359375,
+      "completions/min_length": 131.0,
+      "completions/min_terminated_length": 131.0,
+      "epoch": 1.4797523858653598,
+      "grad_norm": 0.7500477433204651,
+      "kl": 0.0946044921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0098,
+      "num_tokens": 167298809.0,
+      "reward": 1.3700894117355347,
+      "reward_std": 0.16702376306056976,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3700892925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.356184720993042,
+      "step": 1434
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 934.0,
+      "completions/max_terminated_length": 934.0,
+      "completions/mean_length": 457.64288330078125,
+      "completions/mean_terminated_length": 457.64288330078125,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 1.4807841114263605,
+      "grad_norm": 0.6691403388977051,
+      "kl": 0.0888671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0075,
+      "num_tokens": 167418070.0,
+      "reward": 1.2071428298950195,
+      "reward_std": 0.1764199137687683,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.20714284479618073,
+      "rewards/curriculum_aware_reward_fn/std": 0.30349743366241455,
+      "step": 1435
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 847.0,
+      "completions/max_terminated_length": 847.0,
+      "completions/mean_length": 471.919677734375,
+      "completions/mean_terminated_length": 471.919677734375,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 1.4818158369873613,
+      "grad_norm": 0.7403013110160828,
+      "kl": 0.08349609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0022,
+      "num_tokens": 167546714.0,
+      "reward": 1.3919644355773926,
+      "reward_std": 0.19533541798591614,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39196428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3597768247127533,
+      "step": 1436
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 955.0,
+      "completions/max_terminated_length": 955.0,
+      "completions/mean_length": 429.7410888671875,
+      "completions/mean_terminated_length": 429.7410888671875,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 1.4828475625483621,
+      "grad_norm": 0.6121811866760254,
+      "kl": 0.088623046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0255,
+      "num_tokens": 167643670.0,
+      "reward": 1.395982265472412,
+      "reward_std": 0.14923794567584991,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39598211646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.3318755328655243,
+      "step": 1437
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 864.0,
+      "completions/max_terminated_length": 864.0,
+      "completions/mean_length": 457.2410888671875,
+      "completions/mean_terminated_length": 457.2410888671875,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 1.483879288109363,
+      "grad_norm": 0.6797091960906982,
+      "kl": 0.0784912109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0265,
+      "num_tokens": 167762984.0,
+      "reward": 1.3156250715255737,
+      "reward_std": 0.18089859187602997,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3589920401573181,
+      "step": 1438
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 838.0,
+      "completions/max_terminated_length": 838.0,
+      "completions/mean_length": 373.7589416503906,
+      "completions/mean_terminated_length": 373.7589416503906,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 1.4849110136703638,
+      "grad_norm": 0.728999674320221,
+      "kl": 0.1011962890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0126,
+      "num_tokens": 167867271.0,
+      "reward": 1.4187501668930054,
+      "reward_std": 0.14087806642055511,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41875001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.37830376625061035,
+      "step": 1439
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1249.0,
+      "completions/max_terminated_length": 1249.0,
+      "completions/mean_length": 489.46429443359375,
+      "completions/mean_terminated_length": 489.46429443359375,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 1.4859427392313644,
+      "grad_norm": 0.6123307347297668,
+      "kl": 0.081787109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0009,
+      "num_tokens": 167989445.0,
+      "reward": 1.3000000715255737,
+      "reward_std": 0.1632949411869049,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29999998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3484198749065399,
+      "step": 1440
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1670.0,
+      "completions/max_terminated_length": 1670.0,
+      "completions/mean_length": 426.02679443359375,
+      "completions/mean_terminated_length": 426.02679443359375,
+      "completions/min_length": 69.0,
+      "completions/min_terminated_length": 69.0,
+      "epoch": 1.4869744647923653,
+      "grad_norm": 0.7175002098083496,
+      "kl": 0.084228515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0049,
+      "num_tokens": 168097430.0,
+      "reward": 1.4450894594192505,
+      "reward_std": 0.20554915070533752,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44508928060531616,
+      "rewards/curriculum_aware_reward_fn/std": 0.3961503803730011,
+      "step": 1441
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 869.0,
+      "completions/max_terminated_length": 869.0,
+      "completions/mean_length": 412.8214416503906,
+      "completions/mean_terminated_length": 412.8214416503906,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "epoch": 1.488006190353366,
+      "grad_norm": 0.7144314646720886,
+      "kl": 0.091064453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0084,
+      "num_tokens": 168203138.0,
+      "reward": 1.520535945892334,
+      "reward_std": 0.18925592303276062,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5205357074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.3949885368347168,
+      "step": 1442
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 955.0,
+      "completions/max_terminated_length": 955.0,
+      "completions/mean_length": 444.64288330078125,
+      "completions/mean_terminated_length": 444.64288330078125,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 1.4890379159143667,
+      "grad_norm": 0.5935328602790833,
+      "kl": 0.0838623046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0021,
+      "num_tokens": 168323057.0,
+      "reward": 1.3866074085235596,
+      "reward_std": 0.0912431851029396,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3866071403026581,
+      "rewards/curriculum_aware_reward_fn/std": 0.37554657459259033,
+      "step": 1443
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 892.0,
+      "completions/max_terminated_length": 892.0,
+      "completions/mean_length": 399.7500305175781,
+      "completions/mean_terminated_length": 399.7500305175781,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 1.4900696414753676,
+      "grad_norm": 0.5947428345680237,
+      "kl": 0.0933837890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0017,
+      "num_tokens": 168429051.0,
+      "reward": 1.4129464626312256,
+      "reward_std": 0.14674913883209229,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.421875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3914503753185272,
+      "step": 1444
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1437.0,
+      "completions/max_terminated_length": 1437.0,
+      "completions/mean_length": 467.64288330078125,
+      "completions/mean_terminated_length": 467.64288330078125,
+      "completions/min_length": 243.0,
+      "completions/min_terminated_length": 243.0,
+      "epoch": 1.4911013670363684,
+      "grad_norm": 0.7172960638999939,
+      "kl": 0.0882568359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0169,
+      "num_tokens": 168550909.0,
+      "reward": 1.3223215341567993,
+      "reward_std": 0.19947509467601776,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32232141494750977,
+      "rewards/curriculum_aware_reward_fn/std": 0.345814049243927,
+      "step": 1445
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 771.0,
+      "completions/max_terminated_length": 771.0,
+      "completions/mean_length": 465.83038330078125,
+      "completions/mean_terminated_length": 465.83038330078125,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 1.492133092597369,
+      "grad_norm": 0.7651225924491882,
+      "kl": 0.0965576171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0054,
+      "num_tokens": 168679326.0,
+      "reward": 1.2294644117355347,
+      "reward_std": 0.15720400214195251,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22946429252624512,
+      "rewards/curriculum_aware_reward_fn/std": 0.29906412959098816,
+      "step": 1446
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1656.0,
+      "completions/max_terminated_length": 1656.0,
+      "completions/mean_length": 458.1875305175781,
+      "completions/mean_terminated_length": 458.1875305175781,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 1.4931648181583699,
+      "grad_norm": 0.6342578530311584,
+      "kl": 0.096435546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0139,
+      "num_tokens": 168795218.0,
+      "reward": 1.4379466772079468,
+      "reward_std": 0.13654667139053345,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43794646859169006,
+      "rewards/curriculum_aware_reward_fn/std": 0.3718256652355194,
+      "step": 1447
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1089.0,
+      "completions/max_terminated_length": 1089.0,
+      "completions/mean_length": 483.2500305175781,
+      "completions/mean_terminated_length": 483.2500305175781,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 1.4941965437193707,
+      "grad_norm": 0.6862965226173401,
+      "kl": 0.082275390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0112,
+      "num_tokens": 168917333.0,
+      "reward": 1.3508929014205933,
+      "reward_std": 0.18875552713871002,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3508928716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.35754305124282837,
+      "step": 1448
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 751.0,
+      "completions/max_terminated_length": 751.0,
+      "completions/mean_length": 440.3660888671875,
+      "completions/mean_terminated_length": 440.3660888671875,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 1.4952282692803713,
+      "grad_norm": 0.6744213104248047,
+      "kl": 0.0999755859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0155,
+      "num_tokens": 169033182.0,
+      "reward": 1.3272322416305542,
+      "reward_std": 0.126448854804039,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32723215222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.36235520243644714,
+      "step": 1449
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1382.0,
+      "completions/max_terminated_length": 1382.0,
+      "completions/mean_length": 476.20538330078125,
+      "completions/mean_terminated_length": 476.20538330078125,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 1.4962599948413722,
+      "grad_norm": 0.7253400087356567,
+      "kl": 0.0897216796875,
+      "learning_rate": 1e-06,
+      "loss": 0.026,
+      "num_tokens": 169151848.0,
+      "reward": 1.3625000715255737,
+      "reward_std": 0.18081489205360413,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.37647607922554016,
+      "step": 1450
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 879.0,
+      "completions/max_terminated_length": 879.0,
+      "completions/mean_length": 468.95538330078125,
+      "completions/mean_terminated_length": 468.95538330078125,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 1.497291720402373,
+      "grad_norm": 0.6010491251945496,
+      "kl": 0.0977783203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0119,
+      "num_tokens": 169275946.0,
+      "reward": 1.4964287281036377,
+      "reward_std": 0.15095055103302002,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5053572058677673,
+      "rewards/curriculum_aware_reward_fn/std": 0.3826388418674469,
+      "step": 1451
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 905.0,
+      "completions/max_terminated_length": 905.0,
+      "completions/mean_length": 404.5535888671875,
+      "completions/mean_terminated_length": 404.5535888671875,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 1.4983234459633739,
+      "grad_norm": 0.6166358590126038,
+      "kl": 0.0943603515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0163,
+      "num_tokens": 169382233.0,
+      "reward": 1.4875000715255737,
+      "reward_std": 0.15182271599769592,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48750001192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.37099212408065796,
+      "step": 1452
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 738.0,
+      "completions/max_terminated_length": 738.0,
+      "completions/mean_length": 406.71429443359375,
+      "completions/mean_terminated_length": 406.71429443359375,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 1.4993551715243745,
+      "grad_norm": 0.7371792197227478,
+      "kl": 0.100341796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0098,
+      "num_tokens": 169485269.0,
+      "reward": 1.5120537281036377,
+      "reward_std": 0.23051097989082336,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5209821462631226,
+      "rewards/curriculum_aware_reward_fn/std": 0.36109432578086853,
+      "step": 1453
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1151.0,
+      "completions/max_terminated_length": 1151.0,
+      "completions/mean_length": 465.02679443359375,
+      "completions/mean_terminated_length": 465.02679443359375,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 1.5003868970853753,
+      "grad_norm": 0.6901764869689941,
+      "kl": 0.0924072265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0338,
+      "num_tokens": 169602452.0,
+      "reward": 1.266964316368103,
+      "reward_std": 0.1574702113866806,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.26696428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.34113970398902893,
+      "step": 1454
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 853.0,
+      "completions/max_terminated_length": 853.0,
+      "completions/mean_length": 432.0625305175781,
+      "completions/mean_terminated_length": 432.0625305175781,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 1.501418622646376,
+      "grad_norm": 0.809800386428833,
+      "kl": 0.0904541015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0047,
+      "num_tokens": 169714830.0,
+      "reward": 1.3727679252624512,
+      "reward_std": 0.17047983407974243,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3727678656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.35481810569763184,
+      "step": 1455
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 666.0,
+      "completions/max_terminated_length": 666.0,
+      "completions/mean_length": 410.2232360839844,
+      "completions/mean_terminated_length": 410.2232360839844,
+      "completions/min_length": 158.0,
+      "completions/min_terminated_length": 158.0,
+      "epoch": 1.5024503482073768,
+      "grad_norm": 0.8028994798660278,
+      "kl": 0.109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0107,
+      "num_tokens": 169837551.0,
+      "reward": 1.5437501668930054,
+      "reward_std": 0.18651224672794342,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.543749988079071,
+      "rewards/curriculum_aware_reward_fn/std": 0.38038182258605957,
+      "step": 1456
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 933.0,
+      "completions/max_terminated_length": 933.0,
+      "completions/mean_length": 458.8125305175781,
+      "completions/mean_terminated_length": 458.8125305175781,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "epoch": 1.5034820737683776,
+      "grad_norm": 0.7478047013282776,
+      "kl": 0.0880126953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0133,
+      "num_tokens": 169960275.0,
+      "reward": 1.4183037281036377,
+      "reward_std": 0.1733304113149643,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41830354928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.3492588996887207,
+      "step": 1457
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1865.0,
+      "completions/max_terminated_length": 1865.0,
+      "completions/mean_length": 500.0357360839844,
+      "completions/mean_terminated_length": 500.0357360839844,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 1.5045137993293785,
+      "grad_norm": 0.6545856595039368,
+      "kl": 0.0909423828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0087,
+      "num_tokens": 170078768.0,
+      "reward": 1.4303573369979858,
+      "reward_std": 0.14356939494609833,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4303571879863739,
+      "rewards/curriculum_aware_reward_fn/std": 0.3660328686237335,
+      "step": 1458
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 804.0,
+      "completions/max_terminated_length": 804.0,
+      "completions/mean_length": 443.9107360839844,
+      "completions/mean_terminated_length": 443.9107360839844,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 1.5055455248903793,
+      "grad_norm": 0.6835969686508179,
+      "kl": 0.10302734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0181,
+      "num_tokens": 170194993.0,
+      "reward": 1.3500001430511475,
+      "reward_std": 0.2047101855278015,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3499999940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.3665028214454651,
+      "step": 1459
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 793.0,
+      "completions/max_terminated_length": 793.0,
+      "completions/mean_length": 481.107177734375,
+      "completions/mean_terminated_length": 481.107177734375,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "epoch": 1.50657725045138,
+      "grad_norm": 0.632583737373352,
+      "kl": 0.0914306640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0025,
+      "num_tokens": 170317680.0,
+      "reward": 1.2718751430511475,
+      "reward_std": 0.1807551085948944,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2718749940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.29391586780548096,
+      "step": 1460
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 870.0,
+      "completions/max_terminated_length": 870.0,
+      "completions/mean_length": 443.2500305175781,
+      "completions/mean_terminated_length": 443.2500305175781,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 1.5076089760123808,
+      "grad_norm": 0.7975127100944519,
+      "kl": 0.0994873046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0021,
+      "num_tokens": 170440950.0,
+      "reward": 1.3781250715255737,
+      "reward_std": 0.15491509437561035,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37812501192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.351559042930603,
+      "step": 1461
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 955.0,
+      "completions/max_terminated_length": 955.0,
+      "completions/mean_length": 450.3660888671875,
+      "completions/mean_terminated_length": 450.3660888671875,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "epoch": 1.5086407015733814,
+      "grad_norm": 0.7866511344909668,
+      "kl": 0.0982666015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0022,
+      "num_tokens": 170561251.0,
+      "reward": 1.4392858743667603,
+      "reward_std": 0.1659286618232727,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4392857253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.3631958067417145,
+      "step": 1462
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 872.0,
+      "completions/max_terminated_length": 872.0,
+      "completions/mean_length": 440.3214416503906,
+      "completions/mean_terminated_length": 440.3214416503906,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 1.5096724271343822,
+      "grad_norm": 0.7658423781394958,
+      "kl": 0.11376953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0465,
+      "num_tokens": 170683612.0,
+      "reward": 1.4495537281036377,
+      "reward_std": 0.23375743627548218,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44955357909202576,
+      "rewards/curriculum_aware_reward_fn/std": 0.3733519911766052,
+      "step": 1463
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 876.0,
+      "completions/max_terminated_length": 876.0,
+      "completions/mean_length": 451.2500305175781,
+      "completions/mean_terminated_length": 451.2500305175781,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 1.510704152695383,
+      "grad_norm": 0.7433845400810242,
+      "kl": 0.095947265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0258,
+      "num_tokens": 170803396.0,
+      "reward": 1.3691965341567993,
+      "reward_std": 0.15630899369716644,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36919641494750977,
+      "rewards/curriculum_aware_reward_fn/std": 0.34888792037963867,
+      "step": 1464
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1013.0,
+      "completions/max_terminated_length": 1013.0,
+      "completions/mean_length": 420.0446472167969,
+      "completions/mean_terminated_length": 420.0446472167969,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 1.511735878256384,
+      "grad_norm": 0.6753164529800415,
+      "kl": 0.0966796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0128,
+      "num_tokens": 170915723.0,
+      "reward": 1.4419645071029663,
+      "reward_std": 0.14553149044513702,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.441964328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3703877925872803,
+      "step": 1465
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 988.0,
+      "completions/max_terminated_length": 988.0,
+      "completions/mean_length": 465.0982360839844,
+      "completions/mean_terminated_length": 465.0982360839844,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 1.5127676038173847,
+      "grad_norm": 0.7211045622825623,
+      "kl": 0.0921630859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0059,
+      "num_tokens": 171031822.0,
+      "reward": 1.3616071939468384,
+      "reward_std": 0.1773756593465805,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3616071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.3393145203590393,
+      "step": 1466
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 976.0,
+      "completions/max_terminated_length": 976.0,
+      "completions/mean_length": 433.9375305175781,
+      "completions/mean_terminated_length": 433.9375305175781,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 1.5137993293783853,
+      "grad_norm": 0.8377368450164795,
+      "kl": 0.0975341796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0254,
+      "num_tokens": 171144655.0,
+      "reward": 1.360267996788025,
+      "reward_std": 0.2237682342529297,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36026787757873535,
+      "rewards/curriculum_aware_reward_fn/std": 0.3718797266483307,
+      "step": 1467
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2812.0,
+      "completions/max_terminated_length": 2812.0,
+      "completions/mean_length": 445.70538330078125,
+      "completions/mean_terminated_length": 445.70538330078125,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "epoch": 1.514831054939386,
+      "grad_norm": 0.7094552516937256,
+      "kl": 0.0989990234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0374,
+      "num_tokens": 171264070.0,
+      "reward": 1.3732144832611084,
+      "reward_std": 0.16332145035266876,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3732143044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.3847269117832184,
+      "step": 1468
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 905.0,
+      "completions/max_terminated_length": 905.0,
+      "completions/mean_length": 418.0625305175781,
+      "completions/mean_terminated_length": 418.0625305175781,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 1.5158627805003868,
+      "grad_norm": 0.8003520369529724,
+      "kl": 0.098876953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0181,
+      "num_tokens": 171364912.0,
+      "reward": 1.3218752145767212,
+      "reward_std": 0.1855483502149582,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3218750059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.37373098731040955,
+      "step": 1469
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 790.0,
+      "completions/max_terminated_length": 790.0,
+      "completions/mean_length": 387.4375305175781,
+      "completions/mean_terminated_length": 387.4375305175781,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 1.5168945060613876,
+      "grad_norm": 0.7462090849876404,
+      "kl": 0.1134033203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0188,
+      "num_tokens": 171464501.0,
+      "reward": 1.5415178537368774,
+      "reward_std": 0.19770653545856476,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5415178537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.38107770681381226,
+      "step": 1470
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 841.0,
+      "completions/max_terminated_length": 841.0,
+      "completions/mean_length": 464.1875305175781,
+      "completions/mean_terminated_length": 464.1875305175781,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "epoch": 1.5179262316223885,
+      "grad_norm": 0.6659581065177917,
+      "kl": 0.088623046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0017,
+      "num_tokens": 171584250.0,
+      "reward": 1.3665181398391724,
+      "reward_std": 0.13865076005458832,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3665178716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.3345959186553955,
+      "step": 1471
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 751.0,
+      "completions/max_terminated_length": 751.0,
+      "completions/mean_length": 454.71429443359375,
+      "completions/mean_terminated_length": 454.71429443359375,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 1.5189579571833893,
+      "grad_norm": 0.66074538230896,
+      "kl": 0.0784912109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0072,
+      "num_tokens": 171702518.0,
+      "reward": 1.4281251430511475,
+      "reward_std": 0.09557029604911804,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4281249940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.3529018461704254,
+      "step": 1472
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3126.0,
+      "completions/max_terminated_length": 3126.0,
+      "completions/mean_length": 476.2500305175781,
+      "completions/mean_terminated_length": 476.2500305175781,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 1.51998968274439,
+      "grad_norm": 0.7866209149360657,
+      "kl": 0.0963134765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0541,
+      "num_tokens": 171820139.0,
+      "reward": 1.2075893878936768,
+      "reward_std": 0.26282820105552673,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2254464328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.33858445286750793,
+      "step": 1473
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1056.0,
+      "completions/max_terminated_length": 1056.0,
+      "completions/mean_length": 457.89288330078125,
+      "completions/mean_terminated_length": 457.89288330078125,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 1.5210214083053908,
+      "grad_norm": 0.7619090676307678,
+      "kl": 0.0849609375,
+      "learning_rate": 1e-06,
+      "loss": -0.005,
+      "num_tokens": 171948801.0,
+      "reward": 1.4491074085235596,
+      "reward_std": 0.1694876253604889,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44910717010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.3511873185634613,
+      "step": 1474
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 965.0,
+      "completions/max_terminated_length": 965.0,
+      "completions/mean_length": 457.0000305175781,
+      "completions/mean_terminated_length": 457.0000305175781,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 1.5220531338663914,
+      "grad_norm": 0.626873791217804,
+      "kl": 0.0914306640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0115,
+      "num_tokens": 172072526.0,
+      "reward": 1.40223228931427,
+      "reward_std": 0.1566895991563797,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4111607074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.36537712812423706,
+      "step": 1475
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 938.0,
+      "completions/max_terminated_length": 938.0,
+      "completions/mean_length": 436.4732360839844,
+      "completions/mean_terminated_length": 436.4732360839844,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 1.5230848594273922,
+      "grad_norm": 0.6276397109031677,
+      "kl": 0.08154296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0366,
+      "num_tokens": 172188023.0,
+      "reward": 1.4066966772079468,
+      "reward_std": 0.1416996717453003,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4156250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3868928849697113,
+      "step": 1476
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 910.0,
+      "completions/max_terminated_length": 910.0,
+      "completions/mean_length": 480.26788330078125,
+      "completions/mean_terminated_length": 480.26788330078125,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 1.524116584988393,
+      "grad_norm": 0.7986273169517517,
+      "kl": 0.0885009765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0068,
+      "num_tokens": 172305037.0,
+      "reward": 1.3424108028411865,
+      "reward_std": 0.1933390200138092,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35133931040763855,
+      "rewards/curriculum_aware_reward_fn/std": 0.3585166931152344,
+      "step": 1477
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 773.0,
+      "completions/max_terminated_length": 773.0,
+      "completions/mean_length": 454.8839416503906,
+      "completions/mean_terminated_length": 454.8839416503906,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 1.525148310549394,
+      "grad_norm": 0.73015296459198,
+      "kl": 0.0775146484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0172,
+      "num_tokens": 172431521.0,
+      "reward": 1.4276787042617798,
+      "reward_std": 0.18670453131198883,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42767858505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.3737344741821289,
+      "step": 1478
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 952.0,
+      "completions/max_terminated_length": 952.0,
+      "completions/mean_length": 456.83929443359375,
+      "completions/mean_terminated_length": 456.83929443359375,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "epoch": 1.5261800361103948,
+      "grad_norm": 0.6959778070449829,
+      "kl": 0.0828857421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0118,
+      "num_tokens": 172552393.0,
+      "reward": 1.33973228931427,
+      "reward_std": 0.18844622373580933,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33973217010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.3715161681175232,
+      "step": 1479
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2458.0,
+      "completions/max_terminated_length": 2458.0,
+      "completions/mean_length": 501.2410888671875,
+      "completions/mean_terminated_length": 501.2410888671875,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 1.5272117616713954,
+      "grad_norm": 0.6077437400817871,
+      "kl": 0.0804443359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0435,
+      "num_tokens": 172678861.0,
+      "reward": 1.3825894594192505,
+      "reward_std": 0.15254563093185425,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38258931040763855,
+      "rewards/curriculum_aware_reward_fn/std": 0.3776235580444336,
+      "step": 1480
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 859.0,
+      "completions/max_terminated_length": 859.0,
+      "completions/mean_length": 424.4285888671875,
+      "completions/mean_terminated_length": 424.4285888671875,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 1.5282434872323962,
+      "grad_norm": 0.7133572697639465,
+      "kl": 0.0950927734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0163,
+      "num_tokens": 172785540.0,
+      "reward": 1.5232144594192505,
+      "reward_std": 0.15948817133903503,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5232142806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.3813164234161377,
+      "step": 1481
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 695.0,
+      "completions/max_terminated_length": 695.0,
+      "completions/mean_length": 416.4732360839844,
+      "completions/mean_terminated_length": 416.4732360839844,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 1.5292752127933968,
+      "grad_norm": 0.8125452399253845,
+      "kl": 0.09814453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0033,
+      "num_tokens": 172902934.0,
+      "reward": 1.425446629524231,
+      "reward_std": 0.16105496883392334,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42544645071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.3703538775444031,
+      "step": 1482
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 920.0,
+      "completions/max_terminated_length": 920.0,
+      "completions/mean_length": 480.5089416503906,
+      "completions/mean_terminated_length": 480.5089416503906,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 1.5303069383543977,
+      "grad_norm": 0.6588963866233826,
+      "kl": 0.1007080078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0381,
+      "num_tokens": 173030111.0,
+      "reward": 1.4031251668930054,
+      "reward_std": 0.16077668964862823,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40312501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.37097883224487305,
+      "step": 1483
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1053.0,
+      "completions/max_terminated_length": 1053.0,
+      "completions/mean_length": 432.0982360839844,
+      "completions/mean_terminated_length": 432.0982360839844,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 1.5313386639153985,
+      "grad_norm": 0.7676278352737427,
+      "kl": 0.0838623046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0137,
+      "num_tokens": 173144348.0,
+      "reward": 1.6035715341567993,
+      "reward_std": 0.20362050831317902,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6035714149475098,
+      "rewards/curriculum_aware_reward_fn/std": 0.4420323073863983,
+      "step": 1484
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 997.0,
+      "completions/max_terminated_length": 997.0,
+      "completions/mean_length": 465.5625305175781,
+      "completions/mean_terminated_length": 465.5625305175781,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "epoch": 1.5323703894763994,
+      "grad_norm": 0.79965740442276,
+      "kl": 0.08837890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0046,
+      "num_tokens": 173260478.0,
+      "reward": 1.3312500715255737,
+      "reward_std": 0.1637294739484787,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33125001192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.33498185873031616,
+      "step": 1485
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1007.0,
+      "completions/max_terminated_length": 1007.0,
+      "completions/mean_length": 434.7500305175781,
+      "completions/mean_terminated_length": 434.7500305175781,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 1.5334021150374002,
+      "grad_norm": 0.7042688131332397,
+      "kl": 0.0911865234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0036,
+      "num_tokens": 173375474.0,
+      "reward": 1.395535945892334,
+      "reward_std": 0.14024481177330017,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39553573727607727,
+      "rewards/curriculum_aware_reward_fn/std": 0.33748897910118103,
+      "step": 1486
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 746.0,
+      "completions/max_terminated_length": 746.0,
+      "completions/mean_length": 426.90179443359375,
+      "completions/mean_terminated_length": 426.90179443359375,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 1.5344338405984008,
+      "grad_norm": 0.9660199284553528,
+      "kl": 0.1873779296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0043,
+      "num_tokens": 173484614.0,
+      "reward": 1.2669644355773926,
+      "reward_std": 0.12665453553199768,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.26696428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.34800291061401367,
+      "step": 1487
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 757.0,
+      "completions/max_terminated_length": 757.0,
+      "completions/mean_length": 408.71429443359375,
+      "completions/mean_terminated_length": 408.71429443359375,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "epoch": 1.5354655661594014,
+      "grad_norm": 0.7146320343017578,
+      "kl": 0.089111328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0136,
+      "num_tokens": 173587853.0,
+      "reward": 1.4758931398391724,
+      "reward_std": 0.14861474931240082,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4758928716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.37781867384910583,
+      "step": 1488
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 827.0,
+      "completions/max_terminated_length": 827.0,
+      "completions/mean_length": 439.3125305175781,
+      "completions/mean_terminated_length": 439.3125305175781,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 1.5364972917204023,
+      "grad_norm": 0.7023717164993286,
+      "kl": 0.0887451171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0079,
+      "num_tokens": 173704438.0,
+      "reward": 1.296875238418579,
+      "reward_std": 0.18027296662330627,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.296875,
+      "rewards/curriculum_aware_reward_fn/std": 0.34681832790374756,
+      "step": 1489
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 851.0,
+      "completions/max_terminated_length": 851.0,
+      "completions/mean_length": 444.6875305175781,
+      "completions/mean_terminated_length": 444.6875305175781,
+      "completions/min_length": 147.0,
+      "completions/min_terminated_length": 147.0,
+      "epoch": 1.5375290172814031,
+      "grad_norm": 0.6490708589553833,
+      "kl": 0.1005859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0014,
+      "num_tokens": 173825776.0,
+      "reward": 1.3754465579986572,
+      "reward_std": 0.18801386654376984,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37544646859169006,
+      "rewards/curriculum_aware_reward_fn/std": 0.38502344489097595,
+      "step": 1490
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 703.0,
+      "completions/max_terminated_length": 703.0,
+      "completions/mean_length": 411.7321472167969,
+      "completions/mean_terminated_length": 411.7321472167969,
+      "completions/min_length": 117.0,
+      "completions/min_terminated_length": 117.0,
+      "epoch": 1.538560742842404,
+      "grad_norm": 0.7477344870567322,
+      "kl": 0.090576171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0021,
+      "num_tokens": 173934806.0,
+      "reward": 1.4120535850524902,
+      "reward_std": 0.15478430688381195,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41205358505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.34817707538604736,
+      "step": 1491
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1050.0,
+      "completions/max_terminated_length": 1050.0,
+      "completions/mean_length": 446.9910888671875,
+      "completions/mean_terminated_length": 446.9910888671875,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 1.5395924684034048,
+      "grad_norm": 0.46143588423728943,
+      "kl": 0.078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0129,
+      "num_tokens": 174051887.0,
+      "reward": 1.3267858028411865,
+      "reward_std": 0.07234279066324234,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32678574323654175,
+      "rewards/curriculum_aware_reward_fn/std": 0.36955833435058594,
+      "step": 1492
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 645.0,
+      "completions/max_terminated_length": 645.0,
+      "completions/mean_length": 368.7232360839844,
+      "completions/mean_terminated_length": 368.7232360839844,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "epoch": 1.5406241939644054,
+      "grad_norm": 0.6577993035316467,
+      "kl": 0.1026611328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0003,
+      "num_tokens": 174161896.0,
+      "reward": 1.4656251668930054,
+      "reward_std": 0.16244220733642578,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46562501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.36809906363487244,
+      "step": 1493
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1028.0,
+      "completions/max_terminated_length": 1028.0,
+      "completions/mean_length": 428.4910888671875,
+      "completions/mean_terminated_length": 428.4910888671875,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "epoch": 1.5416559195254063,
+      "grad_norm": 0.7847018241882324,
+      "kl": 0.0850830078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0062,
+      "num_tokens": 174275722.0,
+      "reward": 1.3973214626312256,
+      "reward_std": 0.24171169102191925,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40625,
+      "rewards/curriculum_aware_reward_fn/std": 0.35375240445137024,
+      "step": 1494
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 700.0,
+      "completions/max_terminated_length": 700.0,
+      "completions/mean_length": 417.96429443359375,
+      "completions/mean_terminated_length": 417.96429443359375,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 1.5426876450864069,
+      "grad_norm": 0.7562360763549805,
+      "kl": 0.0975341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0024,
+      "num_tokens": 174390273.0,
+      "reward": 1.4120535850524902,
+      "reward_std": 0.18743880093097687,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41205358505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.37687939405441284,
+      "step": 1495
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1126.0,
+      "completions/max_terminated_length": 1126.0,
+      "completions/mean_length": 473.83929443359375,
+      "completions/mean_terminated_length": 473.83929443359375,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 1.5437193706474077,
+      "grad_norm": 0.6407991051673889,
+      "kl": 0.07470703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0061,
+      "num_tokens": 174505937.0,
+      "reward": 1.3714287281036377,
+      "reward_std": 0.17726092040538788,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37142854928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.3861168622970581,
+      "step": 1496
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 814.0,
+      "completions/max_terminated_length": 814.0,
+      "completions/mean_length": 450.4375305175781,
+      "completions/mean_terminated_length": 450.4375305175781,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 1.5447510962084086,
+      "grad_norm": 0.7203906178474426,
+      "kl": 0.0848388671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0186,
+      "num_tokens": 174631881.0,
+      "reward": 1.4767858982086182,
+      "reward_std": 0.2179284691810608,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48571428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.34986668825149536,
+      "step": 1497
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 829.0,
+      "completions/max_terminated_length": 829.0,
+      "completions/mean_length": 480.9464416503906,
+      "completions/mean_terminated_length": 480.9464416503906,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 1.5457828217694094,
+      "grad_norm": 0.7707893252372742,
+      "kl": 0.0919189453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0256,
+      "num_tokens": 174765148.0,
+      "reward": 1.2928574085235596,
+      "reward_std": 0.16916993260383606,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29285717010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.33556845784187317,
+      "step": 1498
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 760.0,
+      "completions/max_terminated_length": 760.0,
+      "completions/mean_length": 484.5982360839844,
+      "completions/mean_terminated_length": 484.5982360839844,
+      "completions/min_length": 290.0,
+      "completions/min_terminated_length": 290.0,
+      "epoch": 1.5468145473304102,
+      "grad_norm": 0.7403567433357239,
+      "kl": 0.0938720703125,
+      "learning_rate": 1e-06,
+      "loss": -0.017,
+      "num_tokens": 174886028.0,
+      "reward": 1.250892996788025,
+      "reward_std": 0.17120662331581116,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25089287757873535,
+      "rewards/curriculum_aware_reward_fn/std": 0.32403579354286194,
+      "step": 1499
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 992.0,
+      "completions/max_terminated_length": 992.0,
+      "completions/mean_length": 445.1250305175781,
+      "completions/mean_terminated_length": 445.1250305175781,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 1.5478462728914109,
+      "grad_norm": 0.7902706265449524,
+      "kl": 0.0830078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0348,
+      "num_tokens": 175003271.0,
+      "reward": 1.3799108266830444,
+      "reward_std": 0.2304457426071167,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37991073727607727,
+      "rewards/curriculum_aware_reward_fn/std": 0.36151954531669617,
+      "step": 1500
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 745.0,
+      "completions/max_terminated_length": 745.0,
+      "completions/mean_length": 426.1250305175781,
+      "completions/mean_terminated_length": 426.1250305175781,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 1.5488779984524117,
+      "grad_norm": 0.827652096748352,
+      "kl": 0.0955810546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0073,
+      "num_tokens": 175120878.0,
+      "reward": 1.3937503099441528,
+      "reward_std": 0.15829935669898987,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39374998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.36342495679855347,
+      "step": 1501
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1138.0,
+      "completions/max_terminated_length": 1138.0,
+      "completions/mean_length": 497.3125305175781,
+      "completions/mean_terminated_length": 497.3125305175781,
+      "completions/min_length": 279.0,
+      "completions/min_terminated_length": 279.0,
+      "epoch": 1.5499097240134123,
+      "grad_norm": 0.7216241359710693,
+      "kl": 0.0885009765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0092,
+      "num_tokens": 175248362.0,
+      "reward": 1.3397324085235596,
+      "reward_std": 0.17447860538959503,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33973217010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.34557294845581055,
+      "step": 1502
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 834.0,
+      "completions/max_terminated_length": 834.0,
+      "completions/mean_length": 429.4732360839844,
+      "completions/mean_terminated_length": 429.4732360839844,
+      "completions/min_length": 120.0,
+      "completions/min_terminated_length": 120.0,
+      "epoch": 1.5509414495744132,
+      "grad_norm": 0.7612575888633728,
+      "kl": 0.0924072265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0357,
+      "num_tokens": 175362711.0,
+      "reward": 1.3892858028411865,
+      "reward_std": 0.21689125895500183,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38928574323654175,
+      "rewards/curriculum_aware_reward_fn/std": 0.37539342045783997,
+      "step": 1503
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 773.0,
+      "completions/max_terminated_length": 773.0,
+      "completions/mean_length": 450.3125305175781,
+      "completions/mean_terminated_length": 450.3125305175781,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 1.551973175135414,
+      "grad_norm": 0.6470363736152649,
+      "kl": 0.0928955078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0031,
+      "num_tokens": 175483062.0,
+      "reward": 1.325446605682373,
+      "reward_std": 0.18153639137744904,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3343749940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.36724144220352173,
+      "step": 1504
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 796.0,
+      "completions/max_terminated_length": 796.0,
+      "completions/mean_length": 409.8482360839844,
+      "completions/mean_terminated_length": 409.8482360839844,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 1.5530049006964148,
+      "grad_norm": 0.7157784700393677,
+      "kl": 0.0926513671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0058,
+      "num_tokens": 175589690.0,
+      "reward": 1.464285969734192,
+      "reward_std": 0.18744036555290222,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4642857015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.37712928652763367,
+      "step": 1505
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 870.0,
+      "completions/max_terminated_length": 870.0,
+      "completions/mean_length": 429.01788330078125,
+      "completions/mean_terminated_length": 429.01788330078125,
+      "completions/min_length": 94.0,
+      "completions/min_terminated_length": 94.0,
+      "epoch": 1.5540366262574157,
+      "grad_norm": 0.6890504360198975,
+      "kl": 0.0814208984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0115,
+      "num_tokens": 175689708.0,
+      "reward": 1.3250000476837158,
+      "reward_std": 0.15183790028095245,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32500001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3779389262199402,
+      "step": 1506
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 987.0,
+      "completions/max_terminated_length": 987.0,
+      "completions/mean_length": 452.9910888671875,
+      "completions/mean_terminated_length": 452.9910888671875,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "epoch": 1.5550683518184163,
+      "grad_norm": 0.7142993807792664,
+      "kl": 0.08837890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0018,
+      "num_tokens": 175803296.0,
+      "reward": 1.3285716772079468,
+      "reward_std": 0.21572059392929077,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3486460745334625,
+      "step": 1507
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1064.0,
+      "completions/max_terminated_length": 1064.0,
+      "completions/mean_length": 449.6160888671875,
+      "completions/mean_terminated_length": 449.6160888671875,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 1.556100077379417,
+      "grad_norm": 0.743757426738739,
+      "kl": 0.0986328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0195,
+      "num_tokens": 175928840.0,
+      "reward": 1.2982144355773926,
+      "reward_std": 0.13886137306690216,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29821428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3399740755558014,
+      "step": 1508
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 800.0,
+      "completions/max_terminated_length": 800.0,
+      "completions/mean_length": 418.01788330078125,
+      "completions/mean_terminated_length": 418.01788330078125,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 1.5571318029404178,
+      "grad_norm": 0.6630829572677612,
+      "kl": 0.0830078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0048,
+      "num_tokens": 176041605.0,
+      "reward": 1.2790180444717407,
+      "reward_std": 0.18763485550880432,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2790178656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3841826915740967,
+      "step": 1509
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1044.0,
+      "completions/max_terminated_length": 1044.0,
+      "completions/mean_length": 541.5625,
+      "completions/mean_terminated_length": 541.5625,
+      "completions/min_length": 293.0,
+      "completions/min_terminated_length": 293.0,
+      "epoch": 1.5581635285014186,
+      "grad_norm": 0.6752259135246277,
+      "kl": 0.0755615234375,
+      "learning_rate": 1e-06,
+      "loss": 0.018,
+      "num_tokens": 176186274.0,
+      "reward": 1.3950893878936768,
+      "reward_std": 0.19315579533576965,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.395089328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.38787707686424255,
+      "step": 1510
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 913.0,
+      "completions/max_terminated_length": 913.0,
+      "completions/mean_length": 411.9910888671875,
+      "completions/mean_terminated_length": 411.9910888671875,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 1.5591952540624194,
+      "grad_norm": 0.6581578850746155,
+      "kl": 0.0977783203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0303,
+      "num_tokens": 176304111.0,
+      "reward": 1.3366073369979858,
+      "reward_std": 0.13431201875209808,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3366071879863739,
+      "rewards/curriculum_aware_reward_fn/std": 0.3686462938785553,
+      "step": 1511
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1096.0,
+      "completions/max_terminated_length": 1096.0,
+      "completions/mean_length": 509.232177734375,
+      "completions/mean_terminated_length": 509.232177734375,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 1.5602269796234203,
+      "grad_norm": 0.6564598679542542,
+      "kl": 0.083984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0023,
+      "num_tokens": 176432090.0,
+      "reward": 1.3691965341567993,
+      "reward_std": 0.16052451729774475,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.37330886721611023,
+      "step": 1512
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1023.0,
+      "completions/max_terminated_length": 1023.0,
+      "completions/mean_length": 450.64288330078125,
+      "completions/mean_terminated_length": 450.64288330078125,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 1.561258705184421,
+      "grad_norm": 0.7571966052055359,
+      "kl": 0.0841064453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0105,
+      "num_tokens": 176553409.0,
+      "reward": 1.3388394117355347,
+      "reward_std": 0.13540898263454437,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3388392925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.35347121953964233,
+      "step": 1513
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1221.0,
+      "completions/max_terminated_length": 1221.0,
+      "completions/mean_length": 482.1964416503906,
+      "completions/mean_terminated_length": 482.1964416503906,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 1.5622904307454217,
+      "grad_norm": 0.7158542275428772,
+      "kl": 0.080322265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0177,
+      "num_tokens": 176671682.0,
+      "reward": 1.369642972946167,
+      "reward_std": 0.19208936393260956,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36964288353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.3834032714366913,
+      "step": 1514
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 894.0,
+      "completions/max_terminated_length": 894.0,
+      "completions/mean_length": 487.6339416503906,
+      "completions/mean_terminated_length": 487.6339416503906,
+      "completions/min_length": 152.0,
+      "completions/min_terminated_length": 152.0,
+      "epoch": 1.5633221563064224,
+      "grad_norm": 0.7497124075889587,
+      "kl": 0.0919189453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0183,
+      "num_tokens": 176799685.0,
+      "reward": 1.3531250953674316,
+      "reward_std": 0.22372855246067047,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36205360293388367,
+      "rewards/curriculum_aware_reward_fn/std": 0.3610074520111084,
+      "step": 1515
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 919.0,
+      "completions/max_terminated_length": 919.0,
+      "completions/mean_length": 513.1875,
+      "completions/mean_terminated_length": 513.1875,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 1.5643538818674232,
+      "grad_norm": 0.615464448928833,
+      "kl": 0.077880859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0041,
+      "num_tokens": 176924222.0,
+      "reward": 1.3504465818405151,
+      "reward_std": 0.14913828670978546,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3504464328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.34579166769981384,
+      "step": 1516
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1156.0,
+      "completions/max_terminated_length": 1156.0,
+      "completions/mean_length": 466.1964416503906,
+      "completions/mean_terminated_length": 466.1964416503906,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 1.565385607428424,
+      "grad_norm": 0.5593492984771729,
+      "kl": 0.07958984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0044,
+      "num_tokens": 177047360.0,
+      "reward": 1.3919644355773926,
+      "reward_std": 0.11146697402000427,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39196428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.382709264755249,
+      "step": 1517
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1133.0,
+      "completions/max_terminated_length": 1133.0,
+      "completions/mean_length": 541.0089721679688,
+      "completions/mean_terminated_length": 541.0089721679688,
+      "completions/min_length": 151.0,
+      "completions/min_terminated_length": 151.0,
+      "epoch": 1.5664173329894249,
+      "grad_norm": 0.69778972864151,
+      "kl": 0.084228515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0066,
+      "num_tokens": 177180326.0,
+      "reward": 1.322767972946167,
+      "reward_std": 0.15394167602062225,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32276788353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.34640294313430786,
+      "step": 1518
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 924.0,
+      "completions/max_terminated_length": 924.0,
+      "completions/mean_length": 443.5357360839844,
+      "completions/mean_terminated_length": 443.5357360839844,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 1.5674490585504257,
+      "grad_norm": 0.7092502117156982,
+      "kl": 0.098388671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0189,
+      "num_tokens": 177299134.0,
+      "reward": 1.3830357789993286,
+      "reward_std": 0.17929960787296295,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38303571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.36837565898895264,
+      "step": 1519
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 778.0,
+      "completions/max_terminated_length": 778.0,
+      "completions/mean_length": 427.2946472167969,
+      "completions/mean_terminated_length": 427.2946472167969,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 1.5684807841114263,
+      "grad_norm": 0.6543611288070679,
+      "kl": 0.0966796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0126,
+      "num_tokens": 177412966.0,
+      "reward": 1.4455360174179077,
+      "reward_std": 0.17789359390735626,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44553571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3989035487174988,
+      "step": 1520
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1044.0,
+      "completions/max_terminated_length": 1044.0,
+      "completions/mean_length": 491.0714416503906,
+      "completions/mean_terminated_length": 491.0714416503906,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 1.5695125096724272,
+      "grad_norm": 0.7969875335693359,
+      "kl": 0.0858154296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0099,
+      "num_tokens": 177543160.0,
+      "reward": 1.260267972946167,
+      "reward_std": 0.17761948704719543,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.26026788353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.36050131916999817,
+      "step": 1521
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 995.0,
+      "completions/max_terminated_length": 995.0,
+      "completions/mean_length": 483.232177734375,
+      "completions/mean_terminated_length": 483.232177734375,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 1.5705442352334278,
+      "grad_norm": 0.6591067314147949,
+      "kl": 0.08642578125,
+      "learning_rate": 1e-06,
+      "loss": 0.016,
+      "num_tokens": 177666960.0,
+      "reward": 1.333035945892334,
+      "reward_std": 0.16056941449642181,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33303573727607727,
+      "rewards/curriculum_aware_reward_fn/std": 0.3560009300708771,
+      "step": 1522
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1145.0,
+      "completions/mean_length": 533.2053833007812,
+      "completions/mean_terminated_length": 501.1081237792969,
+      "completions/min_length": 315.0,
+      "completions/min_terminated_length": 315.0,
+      "epoch": 1.5715759607944286,
+      "grad_norm": 0.7623335719108582,
+      "kl": 0.0941162109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0693,
+      "num_tokens": 177798597.0,
+      "reward": 1.3187501430511475,
+      "reward_std": 0.23580670356750488,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3276786208152771,
+      "rewards/curriculum_aware_reward_fn/std": 0.3404977321624756,
+      "step": 1523
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1348.0,
+      "completions/max_terminated_length": 1348.0,
+      "completions/mean_length": 490.5625305175781,
+      "completions/mean_terminated_length": 490.5625305175781,
+      "completions/min_length": 290.0,
+      "completions/min_terminated_length": 290.0,
+      "epoch": 1.5726076863554295,
+      "grad_norm": 0.4751555323600769,
+      "kl": 0.0804443359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0091,
+      "num_tokens": 177921286.0,
+      "reward": 1.3678573369979858,
+      "reward_std": 0.10132154077291489,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3678571581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.4004421532154083,
+      "step": 1524
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 982.0,
+      "completions/max_terminated_length": 982.0,
+      "completions/mean_length": 458.107177734375,
+      "completions/mean_terminated_length": 458.107177734375,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 1.5736394119164303,
+      "grad_norm": 0.8363772630691528,
+      "kl": 0.110107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0223,
+      "num_tokens": 178042034.0,
+      "reward": 1.4857144355773926,
+      "reward_std": 0.16093096137046814,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48571428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.338879257440567,
+      "step": 1525
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 948.0,
+      "completions/mean_length": 482.1250305175781,
+      "completions/mean_terminated_length": 449.56756591796875,
+      "completions/min_length": 152.0,
+      "completions/min_terminated_length": 152.0,
+      "epoch": 1.5746711374774311,
+      "grad_norm": 0.7333095073699951,
+      "kl": 0.09375,
+      "learning_rate": 1e-06,
+      "loss": 0.041,
+      "num_tokens": 178159753.0,
+      "reward": 1.3812501430511475,
+      "reward_std": 0.15002723038196564,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3901786208152771,
+      "rewards/curriculum_aware_reward_fn/std": 0.38868218660354614,
+      "step": 1526
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 779.0,
+      "completions/max_terminated_length": 779.0,
+      "completions/mean_length": 420.8839416503906,
+      "completions/mean_terminated_length": 420.8839416503906,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 1.5757028630384318,
+      "grad_norm": 0.7141842842102051,
+      "kl": 0.0958251953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0263,
+      "num_tokens": 178269507.0,
+      "reward": 1.580357313156128,
+      "reward_std": 0.16252008080482483,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5803571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.4622876048088074,
+      "step": 1527
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 924.0,
+      "completions/max_terminated_length": 924.0,
+      "completions/mean_length": 440.294677734375,
+      "completions/mean_terminated_length": 440.294677734375,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 1.5767345885994324,
+      "grad_norm": 0.7308587431907654,
+      "kl": 0.0908203125,
+      "learning_rate": 1e-06,
+      "loss": 0.021,
+      "num_tokens": 178383325.0,
+      "reward": 1.453125238418579,
+      "reward_std": 0.22767595946788788,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.453125,
+      "rewards/curriculum_aware_reward_fn/std": 0.38104182481765747,
+      "step": 1528
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 788.0,
+      "completions/max_terminated_length": 788.0,
+      "completions/mean_length": 447.9285888671875,
+      "completions/mean_terminated_length": 447.9285888671875,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 1.5777663141604332,
+      "grad_norm": 0.7281290888786316,
+      "kl": 0.0958251953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0181,
+      "num_tokens": 178500105.0,
+      "reward": 1.4352680444717407,
+      "reward_std": 0.18116579949855804,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4352678954601288,
+      "rewards/curriculum_aware_reward_fn/std": 0.3676464259624481,
+      "step": 1529
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 910.0,
+      "completions/max_terminated_length": 910.0,
+      "completions/mean_length": 457.90179443359375,
+      "completions/mean_terminated_length": 457.90179443359375,
+      "completions/min_length": 254.0,
+      "completions/min_terminated_length": 254.0,
+      "epoch": 1.578798039721434,
+      "grad_norm": 0.7084707021713257,
+      "kl": 0.085693359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0082,
+      "num_tokens": 178619668.0,
+      "reward": 1.337053656578064,
+      "reward_std": 0.17020417749881744,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3370535671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.36789795756340027,
+      "step": 1530
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 965.0,
+      "completions/max_terminated_length": 965.0,
+      "completions/mean_length": 498.8482360839844,
+      "completions/mean_terminated_length": 498.8482360839844,
+      "completions/min_length": 263.0,
+      "completions/min_terminated_length": 263.0,
+      "epoch": 1.579829765282435,
+      "grad_norm": 0.657572865486145,
+      "kl": 0.0875244140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0222,
+      "num_tokens": 178739274.0,
+      "reward": 1.338392972946167,
+      "reward_std": 0.23384582996368408,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33839288353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.36650174856185913,
+      "step": 1531
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 892.0,
+      "completions/max_terminated_length": 892.0,
+      "completions/mean_length": 488.5625305175781,
+      "completions/mean_terminated_length": 488.5625305175781,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 1.5808614908434357,
+      "grad_norm": 0.7009193897247314,
+      "kl": 0.0966796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0113,
+      "num_tokens": 178864476.0,
+      "reward": 1.4299108982086182,
+      "reward_std": 0.2491622120141983,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42991071939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3837029039859772,
+      "step": 1532
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1062.0,
+      "completions/max_terminated_length": 1062.0,
+      "completions/mean_length": 496.5535888671875,
+      "completions/mean_terminated_length": 496.5535888671875,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 1.5818932164044366,
+      "grad_norm": 0.5118904709815979,
+      "kl": 0.09130859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0107,
+      "num_tokens": 178988270.0,
+      "reward": 1.3665179014205933,
+      "reward_std": 0.0743982195854187,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3665178716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.3804565966129303,
+      "step": 1533
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1134.0,
+      "completions/max_terminated_length": 1134.0,
+      "completions/mean_length": 497.5000305175781,
+      "completions/mean_terminated_length": 497.5000305175781,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 1.5829249419654372,
+      "grad_norm": 0.6875550746917725,
+      "kl": 0.0859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0054,
+      "num_tokens": 179107340.0,
+      "reward": 1.309821605682373,
+      "reward_std": 0.12580278515815735,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3098214268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.33293938636779785,
+      "step": 1534
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 870.0,
+      "completions/max_terminated_length": 870.0,
+      "completions/mean_length": 421.5089416503906,
+      "completions/mean_terminated_length": 421.5089416503906,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 1.5839566675264378,
+      "grad_norm": 0.6761475205421448,
+      "kl": 0.091064453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0168,
+      "num_tokens": 179214567.0,
+      "reward": 1.4991072416305542,
+      "reward_std": 0.11850286275148392,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49910715222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.39082854986190796,
+      "step": 1535
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 861.0,
+      "completions/max_terminated_length": 861.0,
+      "completions/mean_length": 441.3750305175781,
+      "completions/mean_terminated_length": 441.3750305175781,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 1.5849883930874387,
+      "grad_norm": 0.710762083530426,
+      "kl": 0.0966796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0054,
+      "num_tokens": 179324845.0,
+      "reward": 1.3973215818405151,
+      "reward_std": 0.17368969321250916,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3973214328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.41961154341697693,
+      "step": 1536
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 835.0,
+      "completions/max_terminated_length": 835.0,
+      "completions/mean_length": 465.46429443359375,
+      "completions/mean_terminated_length": 465.46429443359375,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 1.5860201186484395,
+      "grad_norm": 0.6757381558418274,
+      "kl": 0.0928955078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0155,
+      "num_tokens": 179443421.0,
+      "reward": 1.3093751668930054,
+      "reward_std": 0.17009426653385162,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3792792558670044,
+      "step": 1537
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1391.0,
+      "completions/max_terminated_length": 1391.0,
+      "completions/mean_length": 463.20538330078125,
+      "completions/mean_terminated_length": 463.20538330078125,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "epoch": 1.5870518442094403,
+      "grad_norm": 0.6932691931724548,
+      "kl": 0.0926513671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0081,
+      "num_tokens": 179550824.0,
+      "reward": 1.481696605682373,
+      "reward_std": 0.17351554334163666,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4816964268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.39134761691093445,
+      "step": 1538
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1168.0,
+      "completions/max_terminated_length": 1168.0,
+      "completions/mean_length": 485.39288330078125,
+      "completions/mean_terminated_length": 485.39288330078125,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 1.5880835697704412,
+      "grad_norm": 0.5837637186050415,
+      "kl": 0.092041015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0179,
+      "num_tokens": 179672115.0,
+      "reward": 1.4620537757873535,
+      "reward_std": 0.09847518801689148,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4620535969734192,
+      "rewards/curriculum_aware_reward_fn/std": 0.3418448567390442,
+      "step": 1539
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1359.0,
+      "completions/max_terminated_length": 1359.0,
+      "completions/mean_length": 480.294677734375,
+      "completions/mean_terminated_length": 480.294677734375,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 1.5891152953314418,
+      "grad_norm": 0.6784064769744873,
+      "kl": 0.1007080078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0131,
+      "num_tokens": 179794977.0,
+      "reward": 1.4705358743667603,
+      "reward_std": 0.16050057113170624,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4705357551574707,
+      "rewards/curriculum_aware_reward_fn/std": 0.362769216299057,
+      "step": 1540
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1400.0,
+      "completions/max_terminated_length": 1400.0,
+      "completions/mean_length": 537.482177734375,
+      "completions/mean_terminated_length": 537.482177734375,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 1.5901470208924426,
+      "grad_norm": 0.6119048595428467,
+      "kl": 0.095458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0236,
+      "num_tokens": 179925091.0,
+      "reward": 1.3517858982086182,
+      "reward_std": 0.1388634294271469,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35178571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3608004152774811,
+      "step": 1541
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3483.0,
+      "completions/max_terminated_length": 3483.0,
+      "completions/mean_length": 472.4464416503906,
+      "completions/mean_terminated_length": 472.4464416503906,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 1.5911787464534433,
+      "grad_norm": 0.7622206211090088,
+      "kl": 0.095458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0493,
+      "num_tokens": 180039318.0,
+      "reward": 1.4330357313156128,
+      "reward_std": 0.18418605625629425,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4330357015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.3550506830215454,
+      "step": 1542
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 982.0,
+      "completions/max_terminated_length": 982.0,
+      "completions/mean_length": 472.4732360839844,
+      "completions/mean_terminated_length": 472.4732360839844,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 1.592210472014444,
+      "grad_norm": 0.623796284198761,
+      "kl": 0.097900390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0171,
+      "num_tokens": 180161339.0,
+      "reward": 1.4433037042617798,
+      "reward_std": 0.12381229549646378,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44330358505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.37032344937324524,
+      "step": 1543
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1320.0,
+      "completions/max_terminated_length": 1320.0,
+      "completions/mean_length": 533.6428833007812,
+      "completions/mean_terminated_length": 533.6428833007812,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "epoch": 1.593242197575445,
+      "grad_norm": 0.6682934761047363,
+      "kl": 0.0863037109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0106,
+      "num_tokens": 180298086.0,
+      "reward": 1.3924108743667603,
+      "reward_std": 0.1946992129087448,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3924107551574707,
+      "rewards/curriculum_aware_reward_fn/std": 0.378606379032135,
+      "step": 1544
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1005.0,
+      "completions/max_terminated_length": 1005.0,
+      "completions/mean_length": 477.169677734375,
+      "completions/mean_terminated_length": 477.169677734375,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 1.5942739231364458,
+      "grad_norm": 0.5888639092445374,
+      "kl": 0.0933837890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0029,
+      "num_tokens": 180418183.0,
+      "reward": 1.4174107313156128,
+      "reward_std": 0.09873569756746292,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4174107611179352,
+      "rewards/curriculum_aware_reward_fn/std": 0.36556199193000793,
+      "step": 1545
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1291.0,
+      "completions/max_terminated_length": 1291.0,
+      "completions/mean_length": 483.89288330078125,
+      "completions/mean_terminated_length": 483.89288330078125,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 1.5953056486974466,
+      "grad_norm": 0.7629478573799133,
+      "kl": 0.10107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0194,
+      "num_tokens": 180547300.0,
+      "reward": 1.3714287281036377,
+      "reward_std": 0.17599254846572876,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37142854928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.3239476680755615,
+      "step": 1546
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1213.0,
+      "completions/max_terminated_length": 1213.0,
+      "completions/mean_length": 511.4285888671875,
+      "completions/mean_terminated_length": 511.4285888671875,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 1.5963373742584472,
+      "grad_norm": 0.6964306235313416,
+      "kl": 0.091064453125,
+      "learning_rate": 1e-06,
+      "loss": 0.001,
+      "num_tokens": 180673691.0,
+      "reward": 1.3066965341567993,
+      "reward_std": 0.1090642586350441,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30669641494750977,
+      "rewards/curriculum_aware_reward_fn/std": 0.36959290504455566,
+      "step": 1547
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1661.0,
+      "completions/max_terminated_length": 1661.0,
+      "completions/mean_length": 520.3482666015625,
+      "completions/mean_terminated_length": 520.3482666015625,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 1.597369099819448,
+      "grad_norm": 0.6426225900650024,
+      "kl": 0.0911865234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0046,
+      "num_tokens": 180802992.0,
+      "reward": 1.428125023841858,
+      "reward_std": 0.14593568444252014,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4281250536441803,
+      "rewards/curriculum_aware_reward_fn/std": 0.3542395234107971,
+      "step": 1548
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1078.0,
+      "completions/max_terminated_length": 1078.0,
+      "completions/mean_length": 482.89288330078125,
+      "completions/mean_terminated_length": 482.89288330078125,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 1.5984008253804487,
+      "grad_norm": 0.7396179437637329,
+      "kl": 0.0921630859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0038,
+      "num_tokens": 180928315.0,
+      "reward": 1.362946629524231,
+      "reward_std": 0.2182372510433197,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.362946480512619,
+      "rewards/curriculum_aware_reward_fn/std": 0.36072656512260437,
+      "step": 1549
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1362.0,
+      "completions/max_terminated_length": 1362.0,
+      "completions/mean_length": 488.1160888671875,
+      "completions/mean_terminated_length": 488.1160888671875,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 1.5994325509414495,
+      "grad_norm": 0.6970319151878357,
+      "kl": 0.107666015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0012,
+      "num_tokens": 181055763.0,
+      "reward": 1.391964316368103,
+      "reward_std": 0.15285947918891907,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39196428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.388839453458786,
+      "step": 1550
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1025.0,
+      "completions/max_terminated_length": 1025.0,
+      "completions/mean_length": 527.8035888671875,
+      "completions/mean_terminated_length": 527.8035888671875,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 1.6004642765024504,
+      "grad_norm": 0.6473516225814819,
+      "kl": 0.0899658203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0143,
+      "num_tokens": 181188055.0,
+      "reward": 1.3830358982086182,
+      "reward_std": 0.16586752235889435,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38303571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3928157687187195,
+      "step": 1551
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1181.0,
+      "completions/max_terminated_length": 1181.0,
+      "completions/mean_length": 513.125,
+      "completions/mean_terminated_length": 513.125,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 1.6014960020634512,
+      "grad_norm": 0.7452658414840698,
+      "kl": 0.108642578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0142,
+      "num_tokens": 181321265.0,
+      "reward": 1.3812501430511475,
+      "reward_std": 0.22256553173065186,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3901785910129547,
+      "rewards/curriculum_aware_reward_fn/std": 0.39110833406448364,
+      "step": 1552
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 823.0,
+      "completions/max_terminated_length": 823.0,
+      "completions/mean_length": 499.96429443359375,
+      "completions/mean_terminated_length": 499.96429443359375,
+      "completions/min_length": 277.0,
+      "completions/min_terminated_length": 277.0,
+      "epoch": 1.602527727624452,
+      "grad_norm": 0.6822240948677063,
+      "kl": 0.0894775390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0016,
+      "num_tokens": 181450264.0,
+      "reward": 1.325446605682373,
+      "reward_std": 0.18837910890579224,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3254464566707611,
+      "rewards/curriculum_aware_reward_fn/std": 0.34809622168540955,
+      "step": 1553
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 862.0,
+      "completions/max_terminated_length": 862.0,
+      "completions/mean_length": 467.9285888671875,
+      "completions/mean_terminated_length": 467.9285888671875,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 1.6035594531854527,
+      "grad_norm": 0.8079191446304321,
+      "kl": 0.0975341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0111,
+      "num_tokens": 181565609.0,
+      "reward": 1.427232265472412,
+      "reward_std": 0.20859649777412415,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43616074323654175,
+      "rewards/curriculum_aware_reward_fn/std": 0.38602909445762634,
+      "step": 1554
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 743.0,
+      "completions/max_terminated_length": 743.0,
+      "completions/mean_length": 464.2857360839844,
+      "completions/mean_terminated_length": 464.2857360839844,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 1.6045911787464533,
+      "grad_norm": 0.6506158113479614,
+      "kl": 0.088623046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0151,
+      "num_tokens": 181687679.0,
+      "reward": 1.3696428537368774,
+      "reward_std": 0.20352153480052948,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36964288353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.3838142454624176,
+      "step": 1555
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1055.0,
+      "completions/max_terminated_length": 1055.0,
+      "completions/mean_length": 470.08038330078125,
+      "completions/mean_terminated_length": 470.08038330078125,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 1.6056229043074541,
+      "grad_norm": 0.8110823631286621,
+      "kl": 0.08837890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0051,
+      "num_tokens": 181802999.0,
+      "reward": 1.3977681398391724,
+      "reward_std": 0.23524494469165802,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3977678716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.3921176493167877,
+      "step": 1556
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 931.0,
+      "completions/max_terminated_length": 931.0,
+      "completions/mean_length": 505.7232360839844,
+      "completions/mean_terminated_length": 505.7232360839844,
+      "completions/min_length": 277.0,
+      "completions/min_terminated_length": 277.0,
+      "epoch": 1.606654629868455,
+      "grad_norm": 0.7459127306938171,
+      "kl": 0.099853515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0006,
+      "num_tokens": 181933259.0,
+      "reward": 1.2517858743667603,
+      "reward_std": 0.1917809098958969,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2517857253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.3410983979701996,
+      "step": 1557
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1151.0,
+      "completions/max_terminated_length": 1151.0,
+      "completions/mean_length": 453.857177734375,
+      "completions/mean_terminated_length": 453.857177734375,
+      "completions/min_length": 274.0,
+      "completions/min_terminated_length": 274.0,
+      "epoch": 1.6076863554294558,
+      "grad_norm": 0.5741895437240601,
+      "kl": 0.109619140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0007,
+      "num_tokens": 182042618.0,
+      "reward": 1.3982144594192505,
+      "reward_std": 0.11688823997974396,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39821428060531616,
+      "rewards/curriculum_aware_reward_fn/std": 0.5040345191955566,
+      "step": 1558
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1043.0,
+      "completions/max_terminated_length": 1043.0,
+      "completions/mean_length": 475.544677734375,
+      "completions/mean_terminated_length": 475.544677734375,
+      "completions/min_length": 259.0,
+      "completions/min_terminated_length": 259.0,
+      "epoch": 1.6087180809904567,
+      "grad_norm": 0.6165961623191833,
+      "kl": 0.0887451171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0013,
+      "num_tokens": 182170842.0,
+      "reward": 1.3263394832611084,
+      "reward_std": 0.12872597575187683,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3263393044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.3373158574104309,
+      "step": 1559
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1046.0,
+      "completions/max_terminated_length": 1046.0,
+      "completions/mean_length": 464.5982360839844,
+      "completions/mean_terminated_length": 464.5982360839844,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 1.6097498065514573,
+      "grad_norm": 0.7019606828689575,
+      "kl": 0.0860595703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0158,
+      "num_tokens": 182277995.0,
+      "reward": 1.5254465341567993,
+      "reward_std": 0.13480007648468018,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5254464745521545,
+      "rewards/curriculum_aware_reward_fn/std": 0.38072922825813293,
+      "step": 1560
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1210.0,
+      "completions/max_terminated_length": 1210.0,
+      "completions/mean_length": 544.0892944335938,
+      "completions/mean_terminated_length": 544.0892944335938,
+      "completions/min_length": 355.0,
+      "completions/min_terminated_length": 355.0,
+      "epoch": 1.6107815321124581,
+      "grad_norm": 0.6408945918083191,
+      "kl": 0.085693359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0198,
+      "num_tokens": 182418789.0,
+      "reward": 1.211160659790039,
+      "reward_std": 0.19551792740821838,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21116070449352264,
+      "rewards/curriculum_aware_reward_fn/std": 0.30117708444595337,
+      "step": 1561
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 885.0,
+      "completions/max_terminated_length": 885.0,
+      "completions/mean_length": 474.0000305175781,
+      "completions/mean_terminated_length": 474.0000305175781,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 1.6118132576734587,
+      "grad_norm": 0.6941054463386536,
+      "kl": 0.0889892578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0126,
+      "num_tokens": 182536564.0,
+      "reward": 1.4075894355773926,
+      "reward_std": 0.16013428568840027,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40758928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.377951443195343,
+      "step": 1562
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 973.0,
+      "completions/max_terminated_length": 973.0,
+      "completions/mean_length": 481.02679443359375,
+      "completions/mean_terminated_length": 481.02679443359375,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 1.6128449832344596,
+      "grad_norm": 0.7757281064987183,
+      "kl": 0.0960693359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0026,
+      "num_tokens": 182659424.0,
+      "reward": 1.2102679014205933,
+      "reward_std": 0.1673770248889923,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21026785671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.33665943145751953,
+      "step": 1563
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 700.0,
+      "completions/max_terminated_length": 700.0,
+      "completions/mean_length": 415.96429443359375,
+      "completions/mean_terminated_length": 415.96429443359375,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 1.6138767087954604,
+      "grad_norm": 0.7108755707740784,
+      "kl": 0.0997314453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0193,
+      "num_tokens": 182779505.0,
+      "reward": 1.4982143640518188,
+      "reward_std": 0.15602374076843262,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4982143044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.3791246712207794,
+      "step": 1564
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 940.0,
+      "completions/max_terminated_length": 940.0,
+      "completions/mean_length": 432.46429443359375,
+      "completions/mean_terminated_length": 432.46429443359375,
+      "completions/min_length": 257.0,
+      "completions/min_terminated_length": 257.0,
+      "epoch": 1.6149084343564613,
+      "grad_norm": 0.6921008229255676,
+      "kl": 0.096923828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0005,
+      "num_tokens": 182896625.0,
+      "reward": 1.497321605682373,
+      "reward_std": 0.1866346001625061,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5062499642372131,
+      "rewards/curriculum_aware_reward_fn/std": 0.356858491897583,
+      "step": 1565
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 812.0,
+      "completions/max_terminated_length": 812.0,
+      "completions/mean_length": 437.52679443359375,
+      "completions/mean_terminated_length": 437.52679443359375,
+      "completions/min_length": 243.0,
+      "completions/min_terminated_length": 243.0,
+      "epoch": 1.615940159917462,
+      "grad_norm": 0.7729184031486511,
+      "kl": 0.08447265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0029,
+      "num_tokens": 183018582.0,
+      "reward": 1.4276787042617798,
+      "reward_std": 0.2603314518928528,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43660715222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.43774518370628357,
+      "step": 1566
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 800.0,
+      "completions/max_terminated_length": 800.0,
+      "completions/mean_length": 437.9821472167969,
+      "completions/mean_terminated_length": 437.9821472167969,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 1.6169718854784627,
+      "grad_norm": 0.6360601186752319,
+      "kl": 0.095703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0048,
+      "num_tokens": 183134241.0,
+      "reward": 1.4236608743667603,
+      "reward_std": 0.12023750692605972,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4236607253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.4888836741447449,
+      "step": 1567
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1051.0,
+      "completions/max_terminated_length": 1051.0,
+      "completions/mean_length": 459.6160888671875,
+      "completions/mean_terminated_length": 459.6160888671875,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 1.6180036110394636,
+      "grad_norm": 0.6790236830711365,
+      "kl": 0.0955810546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0291,
+      "num_tokens": 183260149.0,
+      "reward": 1.3875001668930054,
+      "reward_std": 0.17968744039535522,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38750001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3827061057090759,
+      "step": 1568
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1218.0,
+      "completions/max_terminated_length": 1218.0,
+      "completions/mean_length": 483.294677734375,
+      "completions/mean_terminated_length": 483.294677734375,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "epoch": 1.6190353366004642,
+      "grad_norm": 0.7099661231040955,
+      "kl": 0.0947265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0116,
+      "num_tokens": 183381926.0,
+      "reward": 1.3736608028411865,
+      "reward_std": 0.20166639983654022,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37366071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.38845735788345337,
+      "step": 1569
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 691.0,
+      "completions/max_terminated_length": 691.0,
+      "completions/mean_length": 422.6875305175781,
+      "completions/mean_terminated_length": 422.6875305175781,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 1.620067062161465,
+      "grad_norm": 0.5651583075523376,
+      "kl": 0.0970458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0032,
+      "num_tokens": 183489943.0,
+      "reward": 1.5111607313156128,
+      "reward_std": 0.1064288541674614,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5111607313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.3640184998512268,
+      "step": 1570
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 876.0,
+      "completions/max_terminated_length": 876.0,
+      "completions/mean_length": 477.3035888671875,
+      "completions/mean_terminated_length": 477.3035888671875,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "epoch": 1.6210987877224659,
+      "grad_norm": 0.6416997313499451,
+      "kl": 0.0816650390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0029,
+      "num_tokens": 183616404.0,
+      "reward": 1.4120537042617798,
+      "reward_std": 0.13270309567451477,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41205358505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.3730955123901367,
+      "step": 1571
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1035.0,
+      "completions/max_terminated_length": 1035.0,
+      "completions/mean_length": 507.76788330078125,
+      "completions/mean_terminated_length": 507.76788330078125,
+      "completions/min_length": 270.0,
+      "completions/min_terminated_length": 270.0,
+      "epoch": 1.6221305132834667,
+      "grad_norm": 0.6909761428833008,
+      "kl": 0.0885009765625,
+      "learning_rate": 1e-06,
+      "loss": -0.015,
+      "num_tokens": 183746851.0,
+      "reward": 1.3093750476837158,
+      "reward_std": 0.22479885816574097,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3407418429851532,
+      "step": 1572
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1098.0,
+      "completions/max_terminated_length": 1098.0,
+      "completions/mean_length": 518.5,
+      "completions/mean_terminated_length": 518.5,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 1.6231622388444675,
+      "grad_norm": 0.6929990649223328,
+      "kl": 0.0860595703125,
+      "learning_rate": 1e-06,
+      "loss": -0.001,
+      "num_tokens": 183871976.0,
+      "reward": 1.3669644594192505,
+      "reward_std": 0.18981127440929413,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36696428060531616,
+      "rewards/curriculum_aware_reward_fn/std": 0.36745738983154297,
+      "step": 1573
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1227.0,
+      "completions/max_terminated_length": 1227.0,
+      "completions/mean_length": 489.4285888671875,
+      "completions/mean_terminated_length": 489.4285888671875,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 1.6241939644054681,
+      "grad_norm": 0.7451035976409912,
+      "kl": 0.0928955078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0145,
+      "num_tokens": 183992529.0,
+      "reward": 1.4111608266830444,
+      "reward_std": 0.18551412224769592,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41116073727607727,
+      "rewards/curriculum_aware_reward_fn/std": 0.36234408617019653,
+      "step": 1574
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 893.0,
+      "completions/max_terminated_length": 893.0,
+      "completions/mean_length": 458.294677734375,
+      "completions/mean_terminated_length": 458.294677734375,
+      "completions/min_length": 151.0,
+      "completions/min_terminated_length": 151.0,
+      "epoch": 1.6252256899664688,
+      "grad_norm": 0.7171928882598877,
+      "kl": 0.098388671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0206,
+      "num_tokens": 184108195.0,
+      "reward": 1.439732313156128,
+      "reward_std": 0.15318885445594788,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4397321343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.38672444224357605,
+      "step": 1575
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1066.0,
+      "completions/max_terminated_length": 1066.0,
+      "completions/mean_length": 468.1875305175781,
+      "completions/mean_terminated_length": 468.1875305175781,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 1.6262574155274696,
+      "grad_norm": 0.7789207100868225,
+      "kl": 0.078369140625,
+      "learning_rate": 1e-06,
+      "loss": 0.029,
+      "num_tokens": 184219012.0,
+      "reward": 1.5397323369979858,
+      "reward_std": 0.2542872130870819,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5397321581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.3428903818130493,
+      "step": 1576
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1379.0,
+      "completions/max_terminated_length": 1379.0,
+      "completions/mean_length": 462.0000305175781,
+      "completions/mean_terminated_length": 462.0000305175781,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "epoch": 1.6272891410884704,
+      "grad_norm": 0.6634514927864075,
+      "kl": 0.0869140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0032,
+      "num_tokens": 184335793.0,
+      "reward": 1.3437501192092896,
+      "reward_std": 0.1134234219789505,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34375,
+      "rewards/curriculum_aware_reward_fn/std": 0.3749549686908722,
+      "step": 1577
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 931.0,
+      "completions/max_terminated_length": 931.0,
+      "completions/mean_length": 490.40179443359375,
+      "completions/mean_terminated_length": 490.40179443359375,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 1.6283208666494713,
+      "grad_norm": 0.5586764812469482,
+      "kl": 0.0897216796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0039,
+      "num_tokens": 184452255.0,
+      "reward": 1.266517996788025,
+      "reward_std": 0.10985580831766129,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.26651787757873535,
+      "rewards/curriculum_aware_reward_fn/std": 0.35586389899253845,
+      "step": 1578
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1086.0,
+      "completions/max_terminated_length": 1086.0,
+      "completions/mean_length": 470.5714416503906,
+      "completions/mean_terminated_length": 470.5714416503906,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 1.6293525922104721,
+      "grad_norm": 0.7115040421485901,
+      "kl": 0.087158203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0008,
+      "num_tokens": 184573930.0,
+      "reward": 1.33973228931427,
+      "reward_std": 0.1624385416507721,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3397321403026581,
+      "rewards/curriculum_aware_reward_fn/std": 0.3505554497241974,
+      "step": 1579
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 846.0,
+      "completions/max_terminated_length": 846.0,
+      "completions/mean_length": 451.357177734375,
+      "completions/mean_terminated_length": 451.357177734375,
+      "completions/min_length": 153.0,
+      "completions/min_terminated_length": 153.0,
+      "epoch": 1.6303843177714727,
+      "grad_norm": 0.6336712837219238,
+      "kl": 0.0947265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0262,
+      "num_tokens": 184691339.0,
+      "reward": 1.3830358982086182,
+      "reward_std": 0.1295861452817917,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38303571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.39561524987220764,
+      "step": 1580
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 906.0,
+      "completions/max_terminated_length": 906.0,
+      "completions/mean_length": 461.5000305175781,
+      "completions/mean_terminated_length": 461.5000305175781,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "epoch": 1.6314160433324736,
+      "grad_norm": 0.7340649962425232,
+      "kl": 0.08837890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0086,
+      "num_tokens": 184807350.0,
+      "reward": 1.3513394594192505,
+      "reward_std": 0.19727714359760284,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35133931040763855,
+      "rewards/curriculum_aware_reward_fn/std": 0.393326073884964,
+      "step": 1581
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 960.0,
+      "completions/max_terminated_length": 960.0,
+      "completions/mean_length": 478.794677734375,
+      "completions/mean_terminated_length": 478.794677734375,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 1.6324477688934742,
+      "grad_norm": 0.6357326507568359,
+      "kl": 0.0958251953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0032,
+      "num_tokens": 184927181.0,
+      "reward": 1.364732265472412,
+      "reward_std": 0.14448943734169006,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36473211646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.3300040066242218,
+      "step": 1582
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 817.0,
+      "completions/max_terminated_length": 817.0,
+      "completions/mean_length": 458.1785888671875,
+      "completions/mean_terminated_length": 458.1785888671875,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 1.633479494454475,
+      "grad_norm": 0.8217315673828125,
+      "kl": 0.09326171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0012,
+      "num_tokens": 185043345.0,
+      "reward": 1.4013394117355347,
+      "reward_std": 0.2089354693889618,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4013392925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.34986868500709534,
+      "step": 1583
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1103.0,
+      "completions/max_terminated_length": 1103.0,
+      "completions/mean_length": 497.8839416503906,
+      "completions/mean_terminated_length": 497.8839416503906,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "epoch": 1.6345112200154759,
+      "grad_norm": 0.728714108467102,
+      "kl": 0.095947265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0111,
+      "num_tokens": 185170837.0,
+      "reward": 1.2767857313156128,
+      "reward_std": 0.18643386662006378,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2857142984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.3156167268753052,
+      "step": 1584
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1203.0,
+      "completions/max_terminated_length": 1203.0,
+      "completions/mean_length": 492.169677734375,
+      "completions/mean_terminated_length": 492.169677734375,
+      "completions/min_length": 153.0,
+      "completions/min_terminated_length": 153.0,
+      "epoch": 1.6355429455764767,
+      "grad_norm": 0.768435001373291,
+      "kl": 0.1011962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0026,
+      "num_tokens": 185288008.0,
+      "reward": 1.3888393640518188,
+      "reward_std": 0.17833693325519562,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3888392746448517,
+      "rewards/curriculum_aware_reward_fn/std": 0.4032922089099884,
+      "step": 1585
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 853.0,
+      "completions/max_terminated_length": 853.0,
+      "completions/mean_length": 414.2946472167969,
+      "completions/mean_terminated_length": 414.2946472167969,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "epoch": 1.6365746711374776,
+      "grad_norm": 0.8125675916671753,
+      "kl": 0.0985107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0051,
+      "num_tokens": 185392712.0,
+      "reward": 1.4598214626312256,
+      "reward_std": 0.20808333158493042,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4598214328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.36612406373023987,
+      "step": 1586
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 955.0,
+      "completions/max_terminated_length": 955.0,
+      "completions/mean_length": 509.2857360839844,
+      "completions/mean_terminated_length": 509.2857360839844,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 1.6376063966984782,
+      "grad_norm": 0.6974402666091919,
+      "kl": 0.087646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0205,
+      "num_tokens": 185516105.0,
+      "reward": 1.3656251430511475,
+      "reward_std": 0.17538422346115112,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3656249940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.33678603172302246,
+      "step": 1587
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1294.0,
+      "completions/max_terminated_length": 1294.0,
+      "completions/mean_length": 506.169677734375,
+      "completions/mean_terminated_length": 506.169677734375,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 1.638638122259479,
+      "grad_norm": 0.596748411655426,
+      "kl": 0.0869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0081,
+      "num_tokens": 185638848.0,
+      "reward": 1.3928571939468384,
+      "reward_std": 0.15835700929164886,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3928571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.38278597593307495,
+      "step": 1588
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 787.0,
+      "completions/max_terminated_length": 787.0,
+      "completions/mean_length": 463.0625305175781,
+      "completions/mean_terminated_length": 463.0625305175781,
+      "completions/min_length": 258.0,
+      "completions/min_terminated_length": 258.0,
+      "epoch": 1.6396698478204796,
+      "grad_norm": 0.8352671265602112,
+      "kl": 0.0986328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0067,
+      "num_tokens": 185759059.0,
+      "reward": 1.3794643878936768,
+      "reward_std": 0.1755049079656601,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3794642984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.3583969175815582,
+      "step": 1589
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 947.0,
+      "completions/max_terminated_length": 947.0,
+      "completions/mean_length": 441.96429443359375,
+      "completions/mean_terminated_length": 441.96429443359375,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 1.6407015733814805,
+      "grad_norm": 0.7398374676704407,
+      "kl": 0.0970458984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0097,
+      "num_tokens": 185873834.0,
+      "reward": 1.4803574085235596,
+      "reward_std": 0.16756568849086761,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48035717010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.3915410041809082,
+      "step": 1590
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 996.0,
+      "completions/max_terminated_length": 996.0,
+      "completions/mean_length": 494.8839416503906,
+      "completions/mean_terminated_length": 494.8839416503906,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "epoch": 1.6417332989424813,
+      "grad_norm": 0.6949180960655212,
+      "kl": 0.087646484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0062,
+      "num_tokens": 185994039.0,
+      "reward": 1.3848215341567993,
+      "reward_std": 0.1573912799358368,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38482144474983215,
+      "rewards/curriculum_aware_reward_fn/std": 0.3977889120578766,
+      "step": 1591
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 695.0,
+      "completions/max_terminated_length": 695.0,
+      "completions/mean_length": 416.5446472167969,
+      "completions/mean_terminated_length": 416.5446472167969,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 1.6427650245034822,
+      "grad_norm": 0.7075119018554688,
+      "kl": 0.10205078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0003,
+      "num_tokens": 186105666.0,
+      "reward": 1.6562501192092896,
+      "reward_std": 0.17057617008686066,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.65625,
+      "rewards/curriculum_aware_reward_fn/std": 0.2933286130428314,
+      "step": 1592
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1830.0,
+      "completions/max_terminated_length": 1830.0,
+      "completions/mean_length": 479.8035888671875,
+      "completions/mean_terminated_length": 479.8035888671875,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 1.643796750064483,
+      "grad_norm": 0.6478801369667053,
+      "kl": 0.084716796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0262,
+      "num_tokens": 186223020.0,
+      "reward": 1.5705358982086182,
+      "reward_std": 0.1583946794271469,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5705357193946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.37044858932495117,
+      "step": 1593
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 914.0,
+      "completions/mean_length": 527.6875,
+      "completions/mean_terminated_length": 495.5405578613281,
+      "completions/min_length": 254.0,
+      "completions/min_terminated_length": 254.0,
+      "epoch": 1.6448284756254836,
+      "grad_norm": 0.6908033490180969,
+      "kl": 0.0889892578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0491,
+      "num_tokens": 186356573.0,
+      "reward": 1.3339287042617798,
+      "reward_std": 0.2090333253145218,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35178571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3469901978969574,
+      "step": 1594
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1386.0,
+      "completions/max_terminated_length": 1386.0,
+      "completions/mean_length": 470.2500305175781,
+      "completions/mean_terminated_length": 470.2500305175781,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 1.6458602011864842,
+      "grad_norm": 0.7876476645469666,
+      "kl": 0.093994140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0019,
+      "num_tokens": 186474166.0,
+      "reward": 1.5098215341567993,
+      "reward_std": 0.2147691696882248,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5098214745521545,
+      "rewards/curriculum_aware_reward_fn/std": 0.35111403465270996,
+      "step": 1595
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1365.0,
+      "completions/max_terminated_length": 1365.0,
+      "completions/mean_length": 479.77679443359375,
+      "completions/mean_terminated_length": 479.77679443359375,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 1.646891926747485,
+      "grad_norm": 0.6431217193603516,
+      "kl": 0.0885009765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0137,
+      "num_tokens": 186595423.0,
+      "reward": 1.2848215103149414,
+      "reward_std": 0.1607237607240677,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28482142090797424,
+      "rewards/curriculum_aware_reward_fn/std": 0.3498287498950958,
+      "step": 1596
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1157.0,
+      "completions/max_terminated_length": 1157.0,
+      "completions/mean_length": 553.7142944335938,
+      "completions/mean_terminated_length": 553.7142944335938,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "epoch": 1.647923652308486,
+      "grad_norm": 0.7455097436904907,
+      "kl": 0.093505859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0118,
+      "num_tokens": 186731765.0,
+      "reward": 1.4160715341567993,
+      "reward_std": 0.24613210558891296,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42499998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.36693274974823,
+      "step": 1597
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 802.0,
+      "completions/max_terminated_length": 802.0,
+      "completions/mean_length": 470.107177734375,
+      "completions/mean_terminated_length": 470.107177734375,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 1.6489553778694868,
+      "grad_norm": 0.8170499801635742,
+      "kl": 0.119384765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0128,
+      "num_tokens": 186852681.0,
+      "reward": 1.3946430683135986,
+      "reward_std": 0.14803080260753632,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3946428596973419,
+      "rewards/curriculum_aware_reward_fn/std": 0.35869866609573364,
+      "step": 1598
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1045.0,
+      "completions/max_terminated_length": 1045.0,
+      "completions/mean_length": 520.357177734375,
+      "completions/mean_terminated_length": 520.357177734375,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 1.6499871034304876,
+      "grad_norm": 0.853003203868866,
+      "kl": 0.084716796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0289,
+      "num_tokens": 186971252.0,
+      "reward": 1.3031251430511475,
+      "reward_std": 0.25454646348953247,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3120535910129547,
+      "rewards/curriculum_aware_reward_fn/std": 0.3552214205265045,
+      "step": 1599
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 939.0,
+      "completions/max_terminated_length": 939.0,
+      "completions/mean_length": 513.4732666015625,
+      "completions/mean_terminated_length": 513.4732666015625,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 1.6510188289914882,
+      "grad_norm": 0.7009985446929932,
+      "kl": 0.098876953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0085,
+      "num_tokens": 187103297.0,
+      "reward": 1.3316963911056519,
+      "reward_std": 0.20880287885665894,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33169645071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.4298541247844696,
+      "step": 1600
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2025.0,
+      "completions/max_terminated_length": 2025.0,
+      "completions/mean_length": 486.3750305175781,
+      "completions/mean_terminated_length": 486.3750305175781,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 1.652050554552489,
+      "grad_norm": 0.766791045665741,
+      "kl": 0.0958251953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0189,
+      "num_tokens": 187217845.0,
+      "reward": 1.3294644355773926,
+      "reward_std": 0.23240506649017334,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32946428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3554944396018982,
+      "step": 1601
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1122.0,
+      "completions/max_terminated_length": 1122.0,
+      "completions/mean_length": 509.6250305175781,
+      "completions/mean_terminated_length": 509.6250305175781,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 1.6530822801134897,
+      "grad_norm": 0.7172535061836243,
+      "kl": 0.09619140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0198,
+      "num_tokens": 187348857.0,
+      "reward": 1.3571429252624512,
+      "reward_std": 0.15233953297138214,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3571428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.38425412774086,
+      "step": 1602
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 847.0,
+      "completions/max_terminated_length": 847.0,
+      "completions/mean_length": 497.482177734375,
+      "completions/mean_terminated_length": 497.482177734375,
+      "completions/min_length": 277.0,
+      "completions/min_terminated_length": 277.0,
+      "epoch": 1.6541140056744905,
+      "grad_norm": 0.7336816787719727,
+      "kl": 0.09521484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0047,
+      "num_tokens": 187471383.0,
+      "reward": 1.4017857313156128,
+      "reward_std": 0.1852165162563324,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4017857015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.3883395493030548,
+      "step": 1603
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1053.0,
+      "completions/max_terminated_length": 1053.0,
+      "completions/mean_length": 486.1160888671875,
+      "completions/mean_terminated_length": 486.1160888671875,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 1.6551457312354914,
+      "grad_norm": 0.7177641987800598,
+      "kl": 0.087890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0039,
+      "num_tokens": 187597900.0,
+      "reward": 1.455357313156128,
+      "reward_std": 0.20527367293834686,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.455357164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.33270373940467834,
+      "step": 1604
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 850.0,
+      "completions/max_terminated_length": 850.0,
+      "completions/mean_length": 489.544677734375,
+      "completions/mean_terminated_length": 489.544677734375,
+      "completions/min_length": 267.0,
+      "completions/min_terminated_length": 267.0,
+      "epoch": 1.6561774567964922,
+      "grad_norm": 0.785552442073822,
+      "kl": 0.0997314453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0008,
+      "num_tokens": 187715155.0,
+      "reward": 1.3361608982086182,
+      "reward_std": 0.20291408896446228,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34508928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3495788872241974,
+      "step": 1605
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1128.0,
+      "completions/max_terminated_length": 1128.0,
+      "completions/mean_length": 488.6875305175781,
+      "completions/mean_terminated_length": 488.6875305175781,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 1.657209182357493,
+      "grad_norm": 0.7447234988212585,
+      "kl": 0.0999755859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0046,
+      "num_tokens": 187839377.0,
+      "reward": 1.3928571939468384,
+      "reward_std": 0.14517280459403992,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.392857164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.35277900099754333,
+      "step": 1606
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 895.0,
+      "completions/max_terminated_length": 895.0,
+      "completions/mean_length": 505.107177734375,
+      "completions/mean_terminated_length": 505.107177734375,
+      "completions/min_length": 309.0,
+      "completions/min_terminated_length": 309.0,
+      "epoch": 1.6582409079184937,
+      "grad_norm": 0.728800892829895,
+      "kl": 0.1026611328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0157,
+      "num_tokens": 187971360.0,
+      "reward": 1.4093750715255737,
+      "reward_std": 0.2143835425376892,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40937501192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.3755195438861847,
+      "step": 1607
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 984.0,
+      "completions/max_terminated_length": 984.0,
+      "completions/mean_length": 566.8839721679688,
+      "completions/mean_terminated_length": 566.8839721679688,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 1.6592726334794945,
+      "grad_norm": 0.6726198196411133,
+      "kl": 0.0870361328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0211,
+      "num_tokens": 188107098.0,
+      "reward": 1.3361608982086182,
+      "reward_std": 0.19342978298664093,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33616071939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3774147629737854,
+      "step": 1608
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 958.0,
+      "completions/max_terminated_length": 958.0,
+      "completions/mean_length": 474.39288330078125,
+      "completions/mean_terminated_length": 474.39288330078125,
+      "completions/min_length": 243.0,
+      "completions/min_terminated_length": 243.0,
+      "epoch": 1.6603043590404951,
+      "grad_norm": 0.73554927110672,
+      "kl": 0.1007080078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0131,
+      "num_tokens": 188224112.0,
+      "reward": 1.3187501430511475,
+      "reward_std": 0.14685951173305511,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3187499940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.3741130530834198,
+      "step": 1609
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1246.0,
+      "completions/max_terminated_length": 1246.0,
+      "completions/mean_length": 493.65179443359375,
+      "completions/mean_terminated_length": 493.65179443359375,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "epoch": 1.661336084601496,
+      "grad_norm": 0.7111493349075317,
+      "kl": 0.093017578125,
+      "learning_rate": 1e-06,
+      "loss": -0.004,
+      "num_tokens": 188357325.0,
+      "reward": 1.2754465341567993,
+      "reward_std": 0.13824760913848877,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.27544644474983215,
+      "rewards/curriculum_aware_reward_fn/std": 0.3375852108001709,
+      "step": 1610
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 951.0,
+      "completions/max_terminated_length": 951.0,
+      "completions/mean_length": 466.5000305175781,
+      "completions/mean_terminated_length": 466.5000305175781,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "epoch": 1.6623678101624968,
+      "grad_norm": 0.7171905636787415,
+      "kl": 0.1085205078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0034,
+      "num_tokens": 188475730.0,
+      "reward": 1.525892972946167,
+      "reward_std": 0.16963417828083038,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5258928537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.40647807717323303,
+      "step": 1611
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 953.0,
+      "completions/max_terminated_length": 953.0,
+      "completions/mean_length": 521.2589721679688,
+      "completions/mean_terminated_length": 521.2589721679688,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 1.6633995357234976,
+      "grad_norm": 0.6403311491012573,
+      "kl": 0.10107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0336,
+      "num_tokens": 188600779.0,
+      "reward": 1.3357144594192505,
+      "reward_std": 0.1563766449689865,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33571431040763855,
+      "rewards/curriculum_aware_reward_fn/std": 0.38720694184303284,
+      "step": 1612
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1240.0,
+      "completions/max_terminated_length": 1240.0,
+      "completions/mean_length": 537.3392944335938,
+      "completions/mean_terminated_length": 537.3392944335938,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 1.6644312612844985,
+      "grad_norm": 0.759138286113739,
+      "kl": 0.0963134765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0245,
+      "num_tokens": 188727398.0,
+      "reward": 1.4004465341567993,
+      "reward_std": 0.14537791907787323,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40044641494750977,
+      "rewards/curriculum_aware_reward_fn/std": 0.3470918834209442,
+      "step": 1613
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 979.0,
+      "completions/max_terminated_length": 979.0,
+      "completions/mean_length": 533.875,
+      "completions/mean_terminated_length": 533.875,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 1.665462986845499,
+      "grad_norm": 0.6733611822128296,
+      "kl": 0.0927734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0243,
+      "num_tokens": 188860895.0,
+      "reward": 1.3901787996292114,
+      "reward_std": 0.13679122924804688,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3901785910129547,
+      "rewards/curriculum_aware_reward_fn/std": 0.35072892904281616,
+      "step": 1614
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 973.0,
+      "completions/max_terminated_length": 973.0,
+      "completions/mean_length": 487.4464416503906,
+      "completions/mean_terminated_length": 487.4464416503906,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 1.6664947124064997,
+      "grad_norm": 0.7703580856323242,
+      "kl": 0.114013671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0124,
+      "num_tokens": 188985426.0,
+      "reward": 1.38660728931427,
+      "reward_std": 0.13961534202098846,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38660717010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.3742850422859192,
+      "step": 1615
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1385.0,
+      "completions/max_terminated_length": 1385.0,
+      "completions/mean_length": 562.6785888671875,
+      "completions/mean_terminated_length": 562.6785888671875,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 1.6675264379675006,
+      "grad_norm": 0.5722839832305908,
+      "kl": 0.0897216796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0022,
+      "num_tokens": 189108982.0,
+      "reward": 1.3946430683135986,
+      "reward_std": 0.16097982227802277,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3946428596973419,
+      "rewards/curriculum_aware_reward_fn/std": 0.39708155393600464,
+      "step": 1616
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1191.0,
+      "completions/max_terminated_length": 1191.0,
+      "completions/mean_length": 529.4285888671875,
+      "completions/mean_terminated_length": 529.4285888671875,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 1.6685581635285014,
+      "grad_norm": 0.7239696979522705,
+      "kl": 0.0936279296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0024,
+      "num_tokens": 189234862.0,
+      "reward": 1.3558037281036377,
+      "reward_std": 0.16733594238758087,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35580357909202576,
+      "rewards/curriculum_aware_reward_fn/std": 0.33582931756973267,
+      "step": 1617
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 698.0,
+      "completions/max_terminated_length": 698.0,
+      "completions/mean_length": 476.4464416503906,
+      "completions/mean_terminated_length": 476.4464416503906,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 1.6695898890895022,
+      "grad_norm": 0.7931858897209167,
+      "kl": 0.1002197265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0144,
+      "num_tokens": 189360758.0,
+      "reward": 1.339285969734192,
+      "reward_std": 0.21934357285499573,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3571428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.37637779116630554,
+      "step": 1618
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1023.0,
+      "completions/max_terminated_length": 1023.0,
+      "completions/mean_length": 512.1964721679688,
+      "completions/mean_terminated_length": 512.1964721679688,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "epoch": 1.670621614650503,
+      "grad_norm": 0.7632045745849609,
+      "kl": 0.095703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0193,
+      "num_tokens": 189487242.0,
+      "reward": 1.345089316368103,
+      "reward_std": 0.18178538978099823,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34508928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3602401316165924,
+      "step": 1619
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 940.0,
+      "completions/max_terminated_length": 940.0,
+      "completions/mean_length": 471.14288330078125,
+      "completions/mean_terminated_length": 471.14288330078125,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 1.6716533402115037,
+      "grad_norm": 1.4711685180664062,
+      "kl": 0.1971435546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0177,
+      "num_tokens": 189603591.0,
+      "reward": 1.3727679252624512,
+      "reward_std": 0.17584413290023804,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3816964328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.38782110810279846,
+      "step": 1620
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 765.0,
+      "completions/max_terminated_length": 765.0,
+      "completions/mean_length": 472.14288330078125,
+      "completions/mean_terminated_length": 472.14288330078125,
+      "completions/min_length": 166.0,
+      "completions/min_terminated_length": 166.0,
+      "epoch": 1.6726850657725045,
+      "grad_norm": 0.7945736646652222,
+      "kl": 0.101318359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0013,
+      "num_tokens": 189716979.0,
+      "reward": 1.5071431398391724,
+      "reward_std": 0.18330030143260956,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5071429014205933,
+      "rewards/curriculum_aware_reward_fn/std": 0.37643763422966003,
+      "step": 1621
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1087.0,
+      "completions/max_terminated_length": 1087.0,
+      "completions/mean_length": 459.0357360839844,
+      "completions/mean_terminated_length": 459.0357360839844,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 1.6737167913335051,
+      "grad_norm": 0.8028322458267212,
+      "kl": 0.11181640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0049,
+      "num_tokens": 189832028.0,
+      "reward": 1.4718750715255737,
+      "reward_std": 0.18403606116771698,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47187501192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.39385947585105896,
+      "step": 1622
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1351.0,
+      "completions/max_terminated_length": 1351.0,
+      "completions/mean_length": 522.107177734375,
+      "completions/mean_terminated_length": 522.107177734375,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 1.674748516894506,
+      "grad_norm": 0.6720342040061951,
+      "kl": 0.1004638671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0078,
+      "num_tokens": 189957487.0,
+      "reward": 1.3482143878936768,
+      "reward_std": 0.1303005814552307,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3571428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.34989428520202637,
+      "step": 1623
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1197.0,
+      "completions/max_terminated_length": 1197.0,
+      "completions/mean_length": 515.4910888671875,
+      "completions/mean_terminated_length": 515.4910888671875,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "epoch": 1.6757802424555068,
+      "grad_norm": 0.6873416900634766,
+      "kl": 0.1005859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0052,
+      "num_tokens": 190071397.0,
+      "reward": 1.356250286102295,
+      "reward_std": 0.12795774638652802,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.37830376625061035,
+      "step": 1624
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1272.0,
+      "completions/max_terminated_length": 1272.0,
+      "completions/mean_length": 505.7589416503906,
+      "completions/mean_terminated_length": 505.7589416503906,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "epoch": 1.6768119680165077,
+      "grad_norm": 0.8274422883987427,
+      "kl": 0.10498046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0366,
+      "num_tokens": 190203222.0,
+      "reward": 1.3004465103149414,
+      "reward_std": 0.2362377792596817,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30044645071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.3548566401004791,
+      "step": 1625
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1102.0,
+      "completions/max_terminated_length": 1102.0,
+      "completions/mean_length": 455.45538330078125,
+      "completions/mean_terminated_length": 455.45538330078125,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 1.6778436935775085,
+      "grad_norm": 0.5675816535949707,
+      "kl": 0.095703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0001,
+      "num_tokens": 190322679.0,
+      "reward": 1.4080358743667603,
+      "reward_std": 0.09176620095968246,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4080357253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.38088053464889526,
+      "step": 1626
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1245.0,
+      "completions/max_terminated_length": 1245.0,
+      "completions/mean_length": 488.294677734375,
+      "completions/mean_terminated_length": 488.294677734375,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "epoch": 1.6788754191385091,
+      "grad_norm": 0.7279961705207825,
+      "kl": 0.0906982421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0116,
+      "num_tokens": 190439395.0,
+      "reward": 1.5200895071029663,
+      "reward_std": 0.1635468453168869,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5200892686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.37208083271980286,
+      "step": 1627
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1001.0,
+      "completions/max_terminated_length": 1001.0,
+      "completions/mean_length": 495.919677734375,
+      "completions/mean_terminated_length": 495.919677734375,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "epoch": 1.67990714469951,
+      "grad_norm": 0.8204648494720459,
+      "kl": 0.106201171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0208,
+      "num_tokens": 190565743.0,
+      "reward": 1.254910945892334,
+      "reward_std": 0.21975882351398468,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2549107074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.31785881519317627,
+      "step": 1628
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 824.0,
+      "completions/max_terminated_length": 824.0,
+      "completions/mean_length": 484.5000305175781,
+      "completions/mean_terminated_length": 484.5000305175781,
+      "completions/min_length": 271.0,
+      "completions/min_terminated_length": 271.0,
+      "epoch": 1.6809388702605106,
+      "grad_norm": 0.6664847731590271,
+      "kl": 0.099853515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0033,
+      "num_tokens": 190682951.0,
+      "reward": 1.4169644117355347,
+      "reward_std": 0.10810566693544388,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41696426272392273,
+      "rewards/curriculum_aware_reward_fn/std": 0.3685590326786041,
+      "step": 1629
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 919.0,
+      "completions/max_terminated_length": 919.0,
+      "completions/mean_length": 532.7410888671875,
+      "completions/mean_terminated_length": 532.7410888671875,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 1.6819705958215114,
+      "grad_norm": 0.7823235392570496,
+      "kl": 0.1058349609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0251,
+      "num_tokens": 190816524.0,
+      "reward": 1.2272322177886963,
+      "reward_std": 0.1962169110774994,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2272321730852127,
+      "rewards/curriculum_aware_reward_fn/std": 0.28179672360420227,
+      "step": 1630
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1198.0,
+      "completions/max_terminated_length": 1198.0,
+      "completions/mean_length": 531.5892944335938,
+      "completions/mean_terminated_length": 531.5892944335938,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 1.6830023213825123,
+      "grad_norm": 0.5920828580856323,
+      "kl": 0.0965576171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0145,
+      "num_tokens": 190947917.0,
+      "reward": 1.3424108028411865,
+      "reward_std": 0.1325589269399643,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34241074323654175,
+      "rewards/curriculum_aware_reward_fn/std": 0.3899185359477997,
+      "step": 1631
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1003.0,
+      "completions/max_terminated_length": 1003.0,
+      "completions/mean_length": 550.6964721679688,
+      "completions/mean_terminated_length": 550.6964721679688,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "epoch": 1.684034046943513,
+      "grad_norm": 0.7132331132888794,
+      "kl": 0.0924072265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0009,
+      "num_tokens": 191081415.0,
+      "reward": 1.1437500715255737,
+      "reward_std": 0.15005722641944885,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.14374999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.2836478054523468,
+      "step": 1632
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 826.0,
+      "completions/max_terminated_length": 826.0,
+      "completions/mean_length": 489.6964416503906,
+      "completions/mean_terminated_length": 489.6964416503906,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 1.685065772504514,
+      "grad_norm": 0.7506689429283142,
+      "kl": 0.10888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0161,
+      "num_tokens": 191205764.0,
+      "reward": 1.3625000715255737,
+      "reward_std": 0.1464381217956543,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36250001192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.371416836977005,
+      "step": 1633
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1004.0,
+      "completions/mean_length": 543.732177734375,
+      "completions/mean_terminated_length": 511.729736328125,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 1.6860974980655146,
+      "grad_norm": 0.7990245819091797,
+      "kl": 0.0947265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0161,
+      "num_tokens": 191333218.0,
+      "reward": 1.4200894832611084,
+      "reward_std": 0.2204861342906952,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4290178716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.306508868932724,
+      "step": 1634
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1108.0,
+      "completions/max_terminated_length": 1108.0,
+      "completions/mean_length": 557.4107666015625,
+      "completions/mean_terminated_length": 557.4107666015625,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "epoch": 1.6871292236265152,
+      "grad_norm": 0.6546155214309692,
+      "kl": 0.0948486328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0145,
+      "num_tokens": 191460573.0,
+      "reward": 1.2379465103149414,
+      "reward_std": 0.14424559473991394,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24687500298023224,
+      "rewards/curriculum_aware_reward_fn/std": 0.3062620759010315,
+      "step": 1635
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 986.0,
+      "completions/max_terminated_length": 986.0,
+      "completions/mean_length": 470.58929443359375,
+      "completions/mean_terminated_length": 470.58929443359375,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 1.688160949187516,
+      "grad_norm": 0.8744813799858093,
+      "kl": 0.1126708984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0053,
+      "num_tokens": 191581424.0,
+      "reward": 1.4120537042617798,
+      "reward_std": 0.21662786602973938,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41205358505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.3654101490974426,
+      "step": 1636
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 942.0,
+      "completions/max_terminated_length": 942.0,
+      "completions/mean_length": 504.5535888671875,
+      "completions/mean_terminated_length": 504.5535888671875,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 1.6891926747485169,
+      "grad_norm": 0.8263741731643677,
+      "kl": 0.0985107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0023,
+      "num_tokens": 191711812.0,
+      "reward": 1.427232265472412,
+      "reward_std": 0.23317548632621765,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43616074323654175,
+      "rewards/curriculum_aware_reward_fn/std": 0.38439202308654785,
+      "step": 1637
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 949.0,
+      "completions/max_terminated_length": 949.0,
+      "completions/mean_length": 491.7589416503906,
+      "completions/mean_terminated_length": 491.7589416503906,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 1.6902244003095177,
+      "grad_norm": 0.8051194548606873,
+      "kl": 0.1053466796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0078,
+      "num_tokens": 191836329.0,
+      "reward": 1.4125001430511475,
+      "reward_std": 0.20308110117912292,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4125000536441803,
+      "rewards/curriculum_aware_reward_fn/std": 0.388430655002594,
+      "step": 1638
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 930.0,
+      "completions/max_terminated_length": 930.0,
+      "completions/mean_length": 459.4107360839844,
+      "completions/mean_terminated_length": 459.4107360839844,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 1.6912561258705185,
+      "grad_norm": 0.798630952835083,
+      "kl": 0.105712890625,
+      "learning_rate": 1e-06,
+      "loss": 0.011,
+      "num_tokens": 191952148.0,
+      "reward": 1.409821629524231,
+      "reward_std": 0.13976800441741943,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40982145071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.3854820132255554,
+      "step": 1639
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1081.0,
+      "completions/max_terminated_length": 1081.0,
+      "completions/mean_length": 525.3482666015625,
+      "completions/mean_terminated_length": 525.3482666015625,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 1.6922878514315192,
+      "grad_norm": 0.8330475091934204,
+      "kl": 0.09716796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0034,
+      "num_tokens": 192074351.0,
+      "reward": 1.3924108743667603,
+      "reward_std": 0.18537965416908264,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3924107253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.3159577250480652,
+      "step": 1640
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1070.0,
+      "completions/max_terminated_length": 1070.0,
+      "completions/mean_length": 478.0982360839844,
+      "completions/mean_terminated_length": 478.0982360839844,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 1.69331957699252,
+      "grad_norm": 0.7114593982696533,
+      "kl": 0.1033935546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0304,
+      "num_tokens": 192190296.0,
+      "reward": 1.427232265472412,
+      "reward_std": 0.20058543980121613,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42723217606544495,
+      "rewards/curriculum_aware_reward_fn/std": 0.40569642186164856,
+      "step": 1641
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 903.0,
+      "completions/max_terminated_length": 903.0,
+      "completions/mean_length": 523.6160888671875,
+      "completions/mean_terminated_length": 523.6160888671875,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 1.6943513025535206,
+      "grad_norm": 0.7585659027099609,
+      "kl": 0.1019287109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0042,
+      "num_tokens": 192312333.0,
+      "reward": 1.2558037042617798,
+      "reward_std": 0.15198412537574768,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25580358505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.3548724949359894,
+      "step": 1642
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 768.0,
+      "completions/max_terminated_length": 768.0,
+      "completions/mean_length": 448.64288330078125,
+      "completions/mean_terminated_length": 448.64288330078125,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 1.6953830281145215,
+      "grad_norm": 0.7520537972450256,
+      "kl": 0.0989990234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0064,
+      "num_tokens": 192423488.0,
+      "reward": 1.411160945892334,
+      "reward_std": 0.17044828832149506,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4111607074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.3557576537132263,
+      "step": 1643
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1032.0,
+      "completions/max_terminated_length": 1032.0,
+      "completions/mean_length": 518.9553833007812,
+      "completions/mean_terminated_length": 518.9553833007812,
+      "completions/min_length": 289.0,
+      "completions/min_terminated_length": 289.0,
+      "epoch": 1.6964147536755223,
+      "grad_norm": 0.7355285286903381,
+      "kl": 0.0966796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0086,
+      "num_tokens": 192540824.0,
+      "reward": 1.3866074085235596,
+      "reward_std": 0.22177164256572723,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3866071403026581,
+      "rewards/curriculum_aware_reward_fn/std": 0.36619433760643005,
+      "step": 1644
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1186.0,
+      "completions/max_terminated_length": 1186.0,
+      "completions/mean_length": 522.9375,
+      "completions/mean_terminated_length": 522.9375,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 1.6974464792365231,
+      "grad_norm": 0.7334874868392944,
+      "kl": 0.103515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0423,
+      "num_tokens": 192679627.0,
+      "reward": 1.321428656578064,
+      "reward_std": 0.20126299560070038,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3214285671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.34095218777656555,
+      "step": 1645
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1029.0,
+      "completions/max_terminated_length": 1029.0,
+      "completions/mean_length": 515.5535888671875,
+      "completions/mean_terminated_length": 515.5535888671875,
+      "completions/min_length": 155.0,
+      "completions/min_terminated_length": 155.0,
+      "epoch": 1.698478204797524,
+      "grad_norm": 0.496624231338501,
+      "kl": 0.0982666015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0127,
+      "num_tokens": 192801223.0,
+      "reward": 1.3687502145767212,
+      "reward_std": 0.09063854068517685,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36875003576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.3997817933559418,
+      "step": 1646
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1053.0,
+      "completions/max_terminated_length": 1053.0,
+      "completions/mean_length": 521.7232666015625,
+      "completions/mean_terminated_length": 521.7232666015625,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 1.6995099303585246,
+      "grad_norm": 0.7509146928787231,
+      "kl": 0.110107421875,
+      "learning_rate": 1e-06,
+      "loss": -0.005,
+      "num_tokens": 192937356.0,
+      "reward": 1.3142858743667603,
+      "reward_std": 0.18662287294864655,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3142856955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.35459840297698975,
+      "step": 1647
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1137.0,
+      "completions/max_terminated_length": 1137.0,
+      "completions/mean_length": 476.83929443359375,
+      "completions/mean_terminated_length": 476.83929443359375,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 1.7005416559195254,
+      "grad_norm": 0.6993504762649536,
+      "kl": 0.101806640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0083,
+      "num_tokens": 193053902.0,
+      "reward": 1.3772321939468384,
+      "reward_std": 0.12742431461811066,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3772321343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.37781307101249695,
+      "step": 1648
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 878.0,
+      "completions/max_terminated_length": 878.0,
+      "completions/mean_length": 534.375,
+      "completions/mean_terminated_length": 534.375,
+      "completions/min_length": 273.0,
+      "completions/min_terminated_length": 273.0,
+      "epoch": 1.701573381480526,
+      "grad_norm": 0.7881371378898621,
+      "kl": 0.1044921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0062,
+      "num_tokens": 193184795.0,
+      "reward": 1.296875,
+      "reward_std": 0.21468262374401093,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3058035671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.361537367105484,
+      "step": 1649
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1022.0,
+      "completions/max_terminated_length": 1022.0,
+      "completions/mean_length": 526.9464721679688,
+      "completions/mean_terminated_length": 526.9464721679688,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "epoch": 1.702605107041527,
+      "grad_norm": 0.7941908240318298,
+      "kl": 0.1002197265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0067,
+      "num_tokens": 193312581.0,
+      "reward": 1.2870537042617798,
+      "reward_std": 0.20489533245563507,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28705358505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.36222195625305176,
+      "step": 1650
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1183.0,
+      "completions/max_terminated_length": 1183.0,
+      "completions/mean_length": 491.9285888671875,
+      "completions/mean_terminated_length": 491.9285888671875,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 1.7036368326025277,
+      "grad_norm": 0.6850597858428955,
+      "kl": 0.10009765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0125,
+      "num_tokens": 193435297.0,
+      "reward": 1.3665181398391724,
+      "reward_std": 0.16435275971889496,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3665178716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.3578205108642578,
+      "step": 1651
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 825.0,
+      "completions/max_terminated_length": 825.0,
+      "completions/mean_length": 487.08929443359375,
+      "completions/mean_terminated_length": 487.08929443359375,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 1.7046685581635286,
+      "grad_norm": 0.7362051010131836,
+      "kl": 0.104736328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0117,
+      "num_tokens": 193551976.0,
+      "reward": 1.3866074085235596,
+      "reward_std": 0.21596673130989075,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3866071403026581,
+      "rewards/curriculum_aware_reward_fn/std": 0.4046454429626465,
+      "step": 1652
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 910.0,
+      "completions/max_terminated_length": 910.0,
+      "completions/mean_length": 499.9910888671875,
+      "completions/mean_terminated_length": 499.9910888671875,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 1.7057002837245294,
+      "grad_norm": 0.7955490946769714,
+      "kl": 0.1060791015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0096,
+      "num_tokens": 193674076.0,
+      "reward": 1.3584822416305542,
+      "reward_std": 0.14499978721141815,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35848215222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.3501536548137665,
+      "step": 1653
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 902.0,
+      "completions/max_terminated_length": 902.0,
+      "completions/mean_length": 485.294677734375,
+      "completions/mean_terminated_length": 485.294677734375,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 1.70673200928553,
+      "grad_norm": 0.7266811728477478,
+      "kl": 0.111572265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0167,
+      "num_tokens": 193794614.0,
+      "reward": 1.4517858028411865,
+      "reward_std": 0.1658746898174286,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45178574323654175,
+      "rewards/curriculum_aware_reward_fn/std": 0.38891908526420593,
+      "step": 1654
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 876.0,
+      "completions/max_terminated_length": 876.0,
+      "completions/mean_length": 456.0089416503906,
+      "completions/mean_terminated_length": 456.0089416503906,
+      "completions/min_length": 128.0,
+      "completions/min_terminated_length": 128.0,
+      "epoch": 1.7077637348465307,
+      "grad_norm": 0.7519125938415527,
+      "kl": 0.1019287109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0076,
+      "num_tokens": 193909450.0,
+      "reward": 1.466071605682373,
+      "reward_std": 0.15212328732013702,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4660714268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.3495952785015106,
+      "step": 1655
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1129.0,
+      "completions/max_terminated_length": 1129.0,
+      "completions/mean_length": 532.4375,
+      "completions/mean_terminated_length": 532.4375,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "epoch": 1.7087954604075315,
+      "grad_norm": 0.8475361466407776,
+      "kl": 0.106689453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0071,
+      "num_tokens": 194028863.0,
+      "reward": 1.3598215579986572,
+      "reward_std": 0.18619807064533234,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35982146859169006,
+      "rewards/curriculum_aware_reward_fn/std": 0.3328717052936554,
+      "step": 1656
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 781.0,
+      "completions/max_terminated_length": 781.0,
+      "completions/mean_length": 447.1607360839844,
+      "completions/mean_terminated_length": 447.1607360839844,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 1.7098271859685323,
+      "grad_norm": 0.8995668888092041,
+      "kl": 0.1075439453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0012,
+      "num_tokens": 194141168.0,
+      "reward": 1.4339287281036377,
+      "reward_std": 0.20498405396938324,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44285720586776733,
+      "rewards/curriculum_aware_reward_fn/std": 0.36129504442214966,
+      "step": 1657
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 879.0,
+      "completions/max_terminated_length": 879.0,
+      "completions/mean_length": 471.6339416503906,
+      "completions/mean_terminated_length": 471.6339416503906,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 1.7108589115295332,
+      "grad_norm": 0.7614728808403015,
+      "kl": 0.110107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0247,
+      "num_tokens": 194269150.0,
+      "reward": 1.3857144117355347,
+      "reward_std": 0.16251114010810852,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3857143223285675,
+      "rewards/curriculum_aware_reward_fn/std": 0.3841703534126282,
+      "step": 1658
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 983.0,
+      "completions/max_terminated_length": 983.0,
+      "completions/mean_length": 499.482177734375,
+      "completions/mean_terminated_length": 499.482177734375,
+      "completions/min_length": 263.0,
+      "completions/min_terminated_length": 263.0,
+      "epoch": 1.711890637090534,
+      "grad_norm": 0.6195462942123413,
+      "kl": 0.1011962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0022,
+      "num_tokens": 194391410.0,
+      "reward": 1.2531250715255737,
+      "reward_std": 0.1709987223148346,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25312501192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.35088568925857544,
+      "step": 1659
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 850.0,
+      "completions/max_terminated_length": 850.0,
+      "completions/mean_length": 447.419677734375,
+      "completions/mean_terminated_length": 447.419677734375,
+      "completions/min_length": 231.0,
+      "completions/min_terminated_length": 231.0,
+      "epoch": 1.7129223626515349,
+      "grad_norm": 0.8020883202552795,
+      "kl": 0.115234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0236,
+      "num_tokens": 194509547.0,
+      "reward": 1.5098215341567993,
+      "reward_std": 0.1900186985731125,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5098214149475098,
+      "rewards/curriculum_aware_reward_fn/std": 0.36778125166893005,
+      "step": 1660
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1608.0,
+      "completions/max_terminated_length": 1608.0,
+      "completions/mean_length": 494.919677734375,
+      "completions/mean_terminated_length": 494.919677734375,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "epoch": 1.7139540882125355,
+      "grad_norm": 0.7013669013977051,
+      "kl": 0.1051025390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0079,
+      "num_tokens": 194628699.0,
+      "reward": 1.3687500953674316,
+      "reward_std": 0.1604437232017517,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36875003576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.37326928973197937,
+      "step": 1661
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 888.0,
+      "completions/max_terminated_length": 888.0,
+      "completions/mean_length": 433.6339416503906,
+      "completions/mean_terminated_length": 433.6339416503906,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 1.714985813773536,
+      "grad_norm": 0.7422285079956055,
+      "kl": 0.1162109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0122,
+      "num_tokens": 194741211.0,
+      "reward": 1.463392972946167,
+      "reward_std": 0.17094425857067108,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46339288353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.3927830159664154,
+      "step": 1662
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1277.0,
+      "completions/max_terminated_length": 1277.0,
+      "completions/mean_length": 504.6964416503906,
+      "completions/mean_terminated_length": 504.6964416503906,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 1.716017539334537,
+      "grad_norm": 0.6004372835159302,
+      "kl": 0.104736328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0127,
+      "num_tokens": 194861010.0,
+      "reward": 1.4325894117355347,
+      "reward_std": 0.11247504502534866,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4325892925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.48331207036972046,
+      "step": 1663
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 949.0,
+      "completions/max_terminated_length": 949.0,
+      "completions/mean_length": 462.1339416503906,
+      "completions/mean_terminated_length": 462.1339416503906,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 1.7170492648955378,
+      "grad_norm": 0.8087993264198303,
+      "kl": 0.0948486328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0189,
+      "num_tokens": 194970512.0,
+      "reward": 1.4084821939468384,
+      "reward_std": 0.19191642105579376,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4084821343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.3480245769023895,
+      "step": 1664
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1514.0,
+      "completions/max_terminated_length": 1514.0,
+      "completions/mean_length": 529.294677734375,
+      "completions/mean_terminated_length": 529.294677734375,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 1.7180809904565386,
+      "grad_norm": 0.6965283751487732,
+      "kl": 0.09814453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0037,
+      "num_tokens": 195099016.0,
+      "reward": 1.3593751192092896,
+      "reward_std": 0.1859753429889679,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3683035671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.3423551023006439,
+      "step": 1665
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 846.0,
+      "completions/max_terminated_length": 846.0,
+      "completions/mean_length": 420.46429443359375,
+      "completions/mean_terminated_length": 420.46429443359375,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 1.7191127160175395,
+      "grad_norm": 0.681612491607666,
+      "kl": 0.1048583984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0202,
+      "num_tokens": 195206938.0,
+      "reward": 1.4933037757873535,
+      "reward_std": 0.11284186691045761,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4933035671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.397089421749115,
+      "step": 1666
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 815.0,
+      "completions/max_terminated_length": 815.0,
+      "completions/mean_length": 480.1607360839844,
+      "completions/mean_terminated_length": 480.1607360839844,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "epoch": 1.72014444157854,
+      "grad_norm": 0.8302071690559387,
+      "kl": 0.1201171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0097,
+      "num_tokens": 195328749.0,
+      "reward": 1.387946605682373,
+      "reward_std": 0.1679256558418274,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3879464268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.37273311614990234,
+      "step": 1667
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1076.0,
+      "completions/max_terminated_length": 1076.0,
+      "completions/mean_length": 509.7232360839844,
+      "completions/mean_terminated_length": 509.7232360839844,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 1.721176167139541,
+      "grad_norm": 0.7681064009666443,
+      "kl": 0.0999755859375,
+      "learning_rate": 1e-06,
+      "loss": 0.013,
+      "num_tokens": 195457798.0,
+      "reward": 1.282142996788025,
+      "reward_std": 0.16930948197841644,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28214287757873535,
+      "rewards/curriculum_aware_reward_fn/std": 0.3344254791736603,
+      "step": 1668
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 848.0,
+      "completions/max_terminated_length": 848.0,
+      "completions/mean_length": 456.1160888671875,
+      "completions/mean_terminated_length": 456.1160888671875,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 1.7222078927005415,
+      "grad_norm": 0.8119245767593384,
+      "kl": 0.0977783203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0156,
+      "num_tokens": 195581025.0,
+      "reward": 1.4183037281036377,
+      "reward_std": 0.21370477974414825,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41830357909202576,
+      "rewards/curriculum_aware_reward_fn/std": 0.37156811356544495,
+      "step": 1669
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 986.0,
+      "completions/max_terminated_length": 986.0,
+      "completions/mean_length": 474.7589416503906,
+      "completions/mean_terminated_length": 474.7589416503906,
+      "completions/min_length": 271.0,
+      "completions/min_terminated_length": 271.0,
+      "epoch": 1.7232396182615424,
+      "grad_norm": 0.7375963926315308,
+      "kl": 0.1015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0114,
+      "num_tokens": 195695749.0,
+      "reward": 1.3129465579986572,
+      "reward_std": 0.15211425721645355,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3129464089870453,
+      "rewards/curriculum_aware_reward_fn/std": 0.35947567224502563,
+      "step": 1670
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1140.0,
+      "completions/max_terminated_length": 1140.0,
+      "completions/mean_length": 494.15179443359375,
+      "completions/mean_terminated_length": 494.15179443359375,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 1.7242713438225432,
+      "grad_norm": 0.7717375159263611,
+      "kl": 0.105224609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0089,
+      "num_tokens": 195822375.0,
+      "reward": 1.2986608743667603,
+      "reward_std": 0.146214559674263,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2986607551574707,
+      "rewards/curriculum_aware_reward_fn/std": 0.3745543956756592,
+      "step": 1671
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 986.0,
+      "completions/max_terminated_length": 986.0,
+      "completions/mean_length": 509.8839416503906,
+      "completions/mean_terminated_length": 509.8839416503906,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 1.725303069383544,
+      "grad_norm": 0.7136673331260681,
+      "kl": 0.108154296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0078,
+      "num_tokens": 195945226.0,
+      "reward": 1.2910715341567993,
+      "reward_std": 0.14577467739582062,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29107144474983215,
+      "rewards/curriculum_aware_reward_fn/std": 0.35937973856925964,
+      "step": 1672
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1102.0,
+      "completions/max_terminated_length": 1102.0,
+      "completions/mean_length": 481.5714416503906,
+      "completions/mean_terminated_length": 481.5714416503906,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 1.726334794944545,
+      "grad_norm": 0.7931310534477234,
+      "kl": 0.1044921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0145,
+      "num_tokens": 196066859.0,
+      "reward": 1.401785969734192,
+      "reward_std": 0.2273138165473938,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4107142984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.3675110340118408,
+      "step": 1673
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 703.0,
+      "completions/max_terminated_length": 703.0,
+      "completions/mean_length": 450.6160888671875,
+      "completions/mean_terminated_length": 450.6160888671875,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 1.7273665205055455,
+      "grad_norm": 0.7282100319862366,
+      "kl": 0.10791015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0172,
+      "num_tokens": 196189491.0,
+      "reward": 1.302232265472412,
+      "reward_std": 0.1299796998500824,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30223211646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.36769235134124756,
+      "step": 1674
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1093.0,
+      "completions/max_terminated_length": 1093.0,
+      "completions/mean_length": 511.732177734375,
+      "completions/mean_terminated_length": 511.732177734375,
+      "completions/min_length": 128.0,
+      "completions/min_terminated_length": 128.0,
+      "epoch": 1.7283982460665464,
+      "grad_norm": 0.7027313113212585,
+      "kl": 0.098876953125,
+      "learning_rate": 1e-06,
+      "loss": -0.002,
+      "num_tokens": 196314775.0,
+      "reward": 1.3165180683135986,
+      "reward_std": 0.11481791734695435,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3165178596973419,
+      "rewards/curriculum_aware_reward_fn/std": 0.38264068961143494,
+      "step": 1675
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 790.0,
+      "completions/max_terminated_length": 790.0,
+      "completions/mean_length": 433.8214416503906,
+      "completions/mean_terminated_length": 433.8214416503906,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 1.729429971627547,
+      "grad_norm": 0.7571617364883423,
+      "kl": 0.105712890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0046,
+      "num_tokens": 196425103.0,
+      "reward": 1.4602679014205933,
+      "reward_std": 0.2094980925321579,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4602678716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.37248486280441284,
+      "step": 1676
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1158.0,
+      "completions/max_terminated_length": 1158.0,
+      "completions/mean_length": 525.3392944335938,
+      "completions/mean_terminated_length": 525.3392944335938,
+      "completions/min_length": 271.0,
+      "completions/min_terminated_length": 271.0,
+      "epoch": 1.7304616971885478,
+      "grad_norm": 0.6972119808197021,
+      "kl": 0.093505859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0071,
+      "num_tokens": 196566989.0,
+      "reward": 1.224107265472412,
+      "reward_std": 0.17237694561481476,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.22410713136196136,
+      "rewards/curriculum_aware_reward_fn/std": 0.33402958512306213,
+      "step": 1677
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 813.0,
+      "completions/max_terminated_length": 813.0,
+      "completions/mean_length": 448.7589416503906,
+      "completions/mean_terminated_length": 448.7589416503906,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 1.7314934227495486,
+      "grad_norm": 0.7930586338043213,
+      "kl": 0.1019287109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0157,
+      "num_tokens": 196675343.0,
+      "reward": 1.4683037996292114,
+      "reward_std": 0.18954959511756897,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4683036208152771,
+      "rewards/curriculum_aware_reward_fn/std": 0.3429466784000397,
+      "step": 1678
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 936.0,
+      "completions/max_terminated_length": 936.0,
+      "completions/mean_length": 475.7589416503906,
+      "completions/mean_terminated_length": 475.7589416503906,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 1.7325251483105495,
+      "grad_norm": 0.736160159111023,
+      "kl": 0.111083984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0005,
+      "num_tokens": 196793168.0,
+      "reward": 1.3767858743667603,
+      "reward_std": 0.22271791100502014,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3767857253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.4599710702896118,
+      "step": 1679
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 877.0,
+      "completions/max_terminated_length": 877.0,
+      "completions/mean_length": 444.544677734375,
+      "completions/mean_terminated_length": 444.544677734375,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 1.7335568738715503,
+      "grad_norm": 0.7676131129264832,
+      "kl": 0.0968017578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0091,
+      "num_tokens": 196910710.0,
+      "reward": 1.5258928537368774,
+      "reward_std": 0.13285356760025024,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5258928537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.35131555795669556,
+      "step": 1680
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 983.0,
+      "completions/max_terminated_length": 983.0,
+      "completions/mean_length": 484.9732360839844,
+      "completions/mean_terminated_length": 484.9732360839844,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 1.734588599432551,
+      "grad_norm": 0.729081392288208,
+      "kl": 0.115478515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0037,
+      "num_tokens": 197032192.0,
+      "reward": 1.235267996788025,
+      "reward_std": 0.1558685004711151,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.23526786267757416,
+      "rewards/curriculum_aware_reward_fn/std": 0.31755247712135315,
+      "step": 1681
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 870.0,
+      "completions/max_terminated_length": 870.0,
+      "completions/mean_length": 467.9732360839844,
+      "completions/mean_terminated_length": 467.9732360839844,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 1.7356203249935516,
+      "grad_norm": 0.7132700085639954,
+      "kl": 0.1109619140625,
+      "learning_rate": 1e-06,
+      "loss": 0.026,
+      "num_tokens": 197146324.0,
+      "reward": 1.4147323369979858,
+      "reward_std": 0.15210282802581787,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4147321283817291,
+      "rewards/curriculum_aware_reward_fn/std": 0.4016653597354889,
+      "step": 1682
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 953.0,
+      "completions/max_terminated_length": 953.0,
+      "completions/mean_length": 467.4464416503906,
+      "completions/mean_terminated_length": 467.4464416503906,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 1.7366520505545524,
+      "grad_norm": 0.8161846995353699,
+      "kl": 0.0970458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0037,
+      "num_tokens": 197265668.0,
+      "reward": 1.3549107313156128,
+      "reward_std": 0.20558597147464752,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3638392984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.35515573620796204,
+      "step": 1683
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1031.0,
+      "completions/max_terminated_length": 1031.0,
+      "completions/mean_length": 518.3839721679688,
+      "completions/mean_terminated_length": 518.3839721679688,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "epoch": 1.7376837761155532,
+      "grad_norm": 0.7936047911643982,
+      "kl": 0.102294921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0053,
+      "num_tokens": 197393664.0,
+      "reward": 1.4267858266830444,
+      "reward_std": 0.20954875648021698,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42678573727607727,
+      "rewards/curriculum_aware_reward_fn/std": 0.3882235586643219,
+      "step": 1684
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 979.0,
+      "completions/max_terminated_length": 979.0,
+      "completions/mean_length": 530.5625,
+      "completions/mean_terminated_length": 530.5625,
+      "completions/min_length": 292.0,
+      "completions/min_terminated_length": 292.0,
+      "epoch": 1.738715501676554,
+      "grad_norm": 0.6724871397018433,
+      "kl": 0.0965576171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0118,
+      "num_tokens": 197532005.0,
+      "reward": 1.2790179252624512,
+      "reward_std": 0.17441387474536896,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2790178656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3681361675262451,
+      "step": 1685
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 730.0,
+      "completions/max_terminated_length": 730.0,
+      "completions/mean_length": 447.9107360839844,
+      "completions/mean_terminated_length": 447.9107360839844,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 1.739747227237555,
+      "grad_norm": 0.6834338903427124,
+      "kl": 0.1043701171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0014,
+      "num_tokens": 197650050.0,
+      "reward": 1.4276787042617798,
+      "reward_std": 0.15436527132987976,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42767858505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.3626006841659546,
+      "step": 1686
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1346.0,
+      "completions/max_terminated_length": 1346.0,
+      "completions/mean_length": 449.4732360839844,
+      "completions/mean_terminated_length": 449.4732360839844,
+      "completions/min_length": 147.0,
+      "completions/min_terminated_length": 147.0,
+      "epoch": 1.7407789527985555,
+      "grad_norm": 0.8147615194320679,
+      "kl": 0.0997314453125,
+      "learning_rate": 1e-06,
+      "loss": -0.027,
+      "num_tokens": 197770629.0,
+      "reward": 1.454017996788025,
+      "reward_std": 0.1792232096195221,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46294644474983215,
+      "rewards/curriculum_aware_reward_fn/std": 0.3991965651512146,
+      "step": 1687
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 930.0,
+      "completions/max_terminated_length": 930.0,
+      "completions/mean_length": 527.3482666015625,
+      "completions/mean_terminated_length": 527.3482666015625,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 1.7418106783595564,
+      "grad_norm": 1.57326078414917,
+      "kl": 0.101806640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0062,
+      "num_tokens": 197906272.0,
+      "reward": 1.1660715341567993,
+      "reward_std": 0.18589244782924652,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.17499999701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.27963942289352417,
+      "step": 1688
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 838.0,
+      "completions/max_terminated_length": 838.0,
+      "completions/mean_length": 441.33929443359375,
+      "completions/mean_terminated_length": 441.33929443359375,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 1.742842403920557,
+      "grad_norm": 0.6428574323654175,
+      "kl": 0.1011962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0023,
+      "num_tokens": 198026703.0,
+      "reward": 1.3812501430511475,
+      "reward_std": 0.13791553676128387,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3812500536441803,
+      "rewards/curriculum_aware_reward_fn/std": 0.3720000386238098,
+      "step": 1689
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 824.0,
+      "completions/max_terminated_length": 824.0,
+      "completions/mean_length": 483.6785888671875,
+      "completions/mean_terminated_length": 483.6785888671875,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 1.7438741294815578,
+      "grad_norm": 0.6593337655067444,
+      "kl": 0.0970458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0007,
+      "num_tokens": 198157714.0,
+      "reward": 1.2169643640518188,
+      "reward_std": 0.09230455011129379,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21696427464485168,
+      "rewards/curriculum_aware_reward_fn/std": 0.33054375648498535,
+      "step": 1690
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 902.0,
+      "completions/max_terminated_length": 902.0,
+      "completions/mean_length": 459.4285888671875,
+      "completions/mean_terminated_length": 459.4285888671875,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 1.7449058550425587,
+      "grad_norm": 0.5632081627845764,
+      "kl": 0.0977783203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0048,
+      "num_tokens": 198276083.0,
+      "reward": 1.3674108982086182,
+      "reward_std": 0.1574571579694748,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36741071939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3781004250049591,
+      "step": 1691
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 975.0,
+      "completions/max_terminated_length": 975.0,
+      "completions/mean_length": 487.5000305175781,
+      "completions/mean_terminated_length": 487.5000305175781,
+      "completions/min_length": 271.0,
+      "completions/min_terminated_length": 271.0,
+      "epoch": 1.7459375806035595,
+      "grad_norm": 0.7337881326675415,
+      "kl": 0.09326171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0079,
+      "num_tokens": 198399990.0,
+      "reward": 1.2334822416305542,
+      "reward_std": 0.1932225376367569,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.23348215222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.3446102738380432,
+      "step": 1692
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 783.0,
+      "completions/max_terminated_length": 783.0,
+      "completions/mean_length": 471.669677734375,
+      "completions/mean_terminated_length": 471.669677734375,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "epoch": 1.7469693061645604,
+      "grad_norm": 0.7575318217277527,
+      "kl": 0.1068115234375,
+      "learning_rate": 1e-06,
+      "loss": 0.001,
+      "num_tokens": 198517336.0,
+      "reward": 1.3642858266830444,
+      "reward_std": 0.20217333734035492,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36428573727607727,
+      "rewards/curriculum_aware_reward_fn/std": 0.33887922763824463,
+      "step": 1693
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1079.0,
+      "completions/max_terminated_length": 1079.0,
+      "completions/mean_length": 470.982177734375,
+      "completions/mean_terminated_length": 470.982177734375,
+      "completions/min_length": 138.0,
+      "completions/min_terminated_length": 138.0,
+      "epoch": 1.748001031725561,
+      "grad_norm": 0.7248097658157349,
+      "kl": 0.10400390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0024,
+      "num_tokens": 198636180.0,
+      "reward": 1.497321605682373,
+      "reward_std": 0.10733328759670258,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4973214566707611,
+      "rewards/curriculum_aware_reward_fn/std": 0.35570254921913147,
+      "step": 1694
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 864.0,
+      "completions/max_terminated_length": 864.0,
+      "completions/mean_length": 454.6875305175781,
+      "completions/mean_terminated_length": 454.6875305175781,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 1.7490327572865618,
+      "grad_norm": 0.7512885332107544,
+      "kl": 0.0963134765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0195,
+      "num_tokens": 198759184.0,
+      "reward": 1.4040179252624512,
+      "reward_std": 0.15522757172584534,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4040178954601288,
+      "rewards/curriculum_aware_reward_fn/std": 0.3663269579410553,
+      "step": 1695
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1483.0,
+      "completions/max_terminated_length": 1483.0,
+      "completions/mean_length": 470.76788330078125,
+      "completions/mean_terminated_length": 470.76788330078125,
+      "completions/min_length": 149.0,
+      "completions/min_terminated_length": 149.0,
+      "epoch": 1.7500644828475624,
+      "grad_norm": 0.7817060947418213,
+      "kl": 0.095458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0052,
+      "num_tokens": 198878935.0,
+      "reward": 1.3401787281036377,
+      "reward_std": 0.16438253223896027,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34017854928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.37161925435066223,
+      "step": 1696
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 797.0,
+      "completions/max_terminated_length": 797.0,
+      "completions/mean_length": 430.8660888671875,
+      "completions/mean_terminated_length": 430.8660888671875,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 1.7510962084085633,
+      "grad_norm": 0.8149338960647583,
+      "kl": 0.1177978515625,
+      "learning_rate": 1e-06,
+      "loss": 0.001,
+      "num_tokens": 198989807.0,
+      "reward": 1.4232144355773926,
+      "reward_std": 0.23195189237594604,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42321428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.39346742630004883,
+      "step": 1697
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1002.0,
+      "completions/max_terminated_length": 1002.0,
+      "completions/mean_length": 464.3214416503906,
+      "completions/mean_terminated_length": 464.3214416503906,
+      "completions/min_length": 245.0,
+      "completions/min_terminated_length": 245.0,
+      "epoch": 1.7521279339695641,
+      "grad_norm": 0.6136691570281982,
+      "kl": 0.0950927734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0178,
+      "num_tokens": 199112049.0,
+      "reward": 1.35535728931427,
+      "reward_std": 0.14693090319633484,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3642857074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.4077199101448059,
+      "step": 1698
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 825.0,
+      "completions/max_terminated_length": 825.0,
+      "completions/mean_length": 461.1250305175781,
+      "completions/mean_terminated_length": 461.1250305175781,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "epoch": 1.753159659530565,
+      "grad_norm": 0.8194558620452881,
+      "kl": 0.1011962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0162,
+      "num_tokens": 199228557.0,
+      "reward": 1.4160715341567993,
+      "reward_std": 0.21462929248809814,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42500004172325134,
+      "rewards/curriculum_aware_reward_fn/std": 0.3724765181541443,
+      "step": 1699
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1068.0,
+      "completions/max_terminated_length": 1068.0,
+      "completions/mean_length": 444.8035888671875,
+      "completions/mean_terminated_length": 444.8035888671875,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 1.7541913850915658,
+      "grad_norm": 0.6593208312988281,
+      "kl": 0.09814453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0086,
+      "num_tokens": 199351280.0,
+      "reward": 1.3517858982086182,
+      "reward_std": 0.1568783074617386,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35178571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.4667275846004486,
+      "step": 1700
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 792.0,
+      "completions/max_terminated_length": 792.0,
+      "completions/mean_length": 466.8660888671875,
+      "completions/mean_terminated_length": 466.8660888671875,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 1.7552231106525664,
+      "grad_norm": 0.4401684105396271,
+      "kl": 0.103759765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0009,
+      "num_tokens": 199466744.0,
+      "reward": 1.2950893640518188,
+      "reward_std": 0.07254636287689209,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2950893044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.3630427122116089,
+      "step": 1701
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1024.0,
+      "completions/mean_length": 459.14288330078125,
+      "completions/mean_terminated_length": 459.14288330078125,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 1.756254836213567,
+      "grad_norm": 0.6597514748573303,
+      "kl": 0.1014404296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0039,
+      "num_tokens": 199580958.0,
+      "reward": 1.3616071939468384,
+      "reward_std": 0.12213817983865738,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3616071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.35163596272468567,
+      "step": 1702
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 775.0,
+      "completions/max_terminated_length": 775.0,
+      "completions/mean_length": 442.02679443359375,
+      "completions/mean_terminated_length": 442.02679443359375,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 1.7572865617745679,
+      "grad_norm": 0.6738682985305786,
+      "kl": 0.0911865234375,
+      "learning_rate": 1e-06,
+      "loss": 0.01,
+      "num_tokens": 199695341.0,
+      "reward": 1.3625000715255737,
+      "reward_std": 0.11475715041160583,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3777303099632263,
+      "step": 1703
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 880.0,
+      "completions/max_terminated_length": 880.0,
+      "completions/mean_length": 443.419677734375,
+      "completions/mean_terminated_length": 443.419677734375,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 1.7583182873355687,
+      "grad_norm": 0.6813015341758728,
+      "kl": 0.100830078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0029,
+      "num_tokens": 199806457.0,
+      "reward": 1.3517858982086182,
+      "reward_std": 0.1438947170972824,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35178571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3782920837402344,
+      "step": 1704
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 927.0,
+      "completions/mean_length": 547.0892944335938,
+      "completions/mean_terminated_length": 515.1171264648438,
+      "completions/min_length": 279.0,
+      "completions/min_terminated_length": 279.0,
+      "epoch": 1.7593500128965696,
+      "grad_norm": 0.7241870164871216,
+      "kl": 0.0975341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0523,
+      "num_tokens": 199945123.0,
+      "reward": 1.2933037281036377,
+      "reward_std": 0.21056966483592987,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30223217606544495,
+      "rewards/curriculum_aware_reward_fn/std": 0.3545958995819092,
+      "step": 1705
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1118.0,
+      "completions/max_terminated_length": 1118.0,
+      "completions/mean_length": 471.0000305175781,
+      "completions/mean_terminated_length": 471.0000305175781,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 1.7603817384575704,
+      "grad_norm": 0.7351222038269043,
+      "kl": 0.1163330078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0074,
+      "num_tokens": 200069513.0,
+      "reward": 1.4196429252624512,
+      "reward_std": 0.13800209760665894,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4196428954601288,
+      "rewards/curriculum_aware_reward_fn/std": 0.352874755859375,
+      "step": 1706
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 866.0,
+      "completions/max_terminated_length": 866.0,
+      "completions/mean_length": 425.26788330078125,
+      "completions/mean_terminated_length": 425.26788330078125,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 1.761413464018571,
+      "grad_norm": 0.7676159739494324,
+      "kl": 0.100830078125,
+      "learning_rate": 1e-06,
+      "loss": -0.003,
+      "num_tokens": 200193380.0,
+      "reward": 1.4200894832611084,
+      "reward_std": 0.16104407608509064,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4200893044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.36560821533203125,
+      "step": 1707
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1157.0,
+      "completions/max_terminated_length": 1157.0,
+      "completions/mean_length": 460.2589416503906,
+      "completions/mean_terminated_length": 460.2589416503906,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 1.7624451895795719,
+      "grad_norm": 0.7323415279388428,
+      "kl": 0.1181640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0011,
+      "num_tokens": 200316273.0,
+      "reward": 1.2410714626312256,
+      "reward_std": 0.17039266228675842,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25,
+      "rewards/curriculum_aware_reward_fn/std": 0.3420262932777405,
+      "step": 1708
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1358.0,
+      "completions/max_terminated_length": 1358.0,
+      "completions/mean_length": 471.8125305175781,
+      "completions/mean_terminated_length": 471.8125305175781,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 1.7634769151405725,
+      "grad_norm": 0.5964041352272034,
+      "kl": 0.1002197265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0084,
+      "num_tokens": 200441720.0,
+      "reward": 1.3169643878936768,
+      "reward_std": 0.09356345236301422,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3169642984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.34416329860687256,
+      "step": 1709
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1023.0,
+      "completions/max_terminated_length": 1023.0,
+      "completions/mean_length": 411.6339416503906,
+      "completions/mean_terminated_length": 411.6339416503906,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 1.7645086407015733,
+      "grad_norm": 0.757871150970459,
+      "kl": 0.0986328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0057,
+      "num_tokens": 200553826.0,
+      "reward": 1.4549108743667603,
+      "reward_std": 0.22730287909507751,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4549107253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.40263545513153076,
+      "step": 1710
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 836.0,
+      "completions/max_terminated_length": 836.0,
+      "completions/mean_length": 438.45538330078125,
+      "completions/mean_terminated_length": 438.45538330078125,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 1.7655403662625742,
+      "grad_norm": 0.7597662210464478,
+      "kl": 0.0989990234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0229,
+      "num_tokens": 200674648.0,
+      "reward": 1.439732313156128,
+      "reward_std": 0.16961470246315002,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4397321343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.3784831464290619,
+      "step": 1711
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 886.0,
+      "completions/max_terminated_length": 886.0,
+      "completions/mean_length": 470.4375305175781,
+      "completions/mean_terminated_length": 470.4375305175781,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 1.766572091823575,
+      "grad_norm": 0.6943768858909607,
+      "kl": 0.098388671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0079,
+      "num_tokens": 200801663.0,
+      "reward": 1.3165180683135986,
+      "reward_std": 0.18066294491291046,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3165178894996643,
+      "rewards/curriculum_aware_reward_fn/std": 0.34493690729141235,
+      "step": 1712
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 915.0,
+      "completions/max_terminated_length": 915.0,
+      "completions/mean_length": 457.0089416503906,
+      "completions/mean_terminated_length": 457.0089416503906,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 1.7676038173845758,
+      "grad_norm": 0.6977770328521729,
+      "kl": 0.0997314453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0093,
+      "num_tokens": 200923107.0,
+      "reward": 1.387946605682373,
+      "reward_std": 0.19864727556705475,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3879464566707611,
+      "rewards/curriculum_aware_reward_fn/std": 0.3827499747276306,
+      "step": 1713
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 976.0,
+      "completions/max_terminated_length": 976.0,
+      "completions/mean_length": 439.044677734375,
+      "completions/mean_terminated_length": 439.044677734375,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "epoch": 1.7686355429455765,
+      "grad_norm": 0.807112991809845,
+      "kl": 0.107177734375,
+      "learning_rate": 1e-06,
+      "loss": 0.029,
+      "num_tokens": 201039802.0,
+      "reward": 1.4120537042617798,
+      "reward_std": 0.1486327052116394,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41205358505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.37562233209609985,
+      "step": 1714
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 580.0,
+      "completions/max_terminated_length": 580.0,
+      "completions/mean_length": 377.1785888671875,
+      "completions/mean_terminated_length": 377.1785888671875,
+      "completions/min_length": 123.0,
+      "completions/min_terminated_length": 123.0,
+      "epoch": 1.7696672685065773,
+      "grad_norm": 0.7447263598442078,
+      "kl": 0.0958251953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0012,
+      "num_tokens": 201139558.0,
+      "reward": 1.4366072416305542,
+      "reward_std": 0.16780443489551544,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43660715222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.37734147906303406,
+      "step": 1715
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 897.0,
+      "completions/max_terminated_length": 897.0,
+      "completions/mean_length": 422.9732360839844,
+      "completions/mean_terminated_length": 422.9732360839844,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 1.770698994067578,
+      "grad_norm": 0.7423616051673889,
+      "kl": 0.0948486328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0248,
+      "num_tokens": 201264379.0,
+      "reward": 1.2924107313156128,
+      "reward_std": 0.17788228392601013,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2924107015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.3585638105869293,
+      "step": 1716
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 761.0,
+      "completions/max_terminated_length": 761.0,
+      "completions/mean_length": 448.8839416503906,
+      "completions/mean_terminated_length": 448.8839416503906,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 1.7717307196285788,
+      "grad_norm": 0.655297577381134,
+      "kl": 0.0926513671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0114,
+      "num_tokens": 201379523.0,
+      "reward": 1.3901787996292114,
+      "reward_std": 0.15744253993034363,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3901785910129547,
+      "rewards/curriculum_aware_reward_fn/std": 0.3982981741428375,
+      "step": 1717
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 754.0,
+      "completions/max_terminated_length": 754.0,
+      "completions/mean_length": 432.6875305175781,
+      "completions/mean_terminated_length": 432.6875305175781,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "epoch": 1.7727624451895796,
+      "grad_norm": 0.8091326951980591,
+      "kl": 0.099609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0025,
+      "num_tokens": 201497606.0,
+      "reward": 1.3633930683135986,
+      "reward_std": 0.1997928023338318,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3633928894996643,
+      "rewards/curriculum_aware_reward_fn/std": 0.3603665828704834,
+      "step": 1718
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 984.0,
+      "completions/max_terminated_length": 984.0,
+      "completions/mean_length": 476.89288330078125,
+      "completions/mean_terminated_length": 476.89288330078125,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 1.7737941707505804,
+      "grad_norm": 0.6853758692741394,
+      "kl": 0.0947265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0096,
+      "num_tokens": 201617784.0,
+      "reward": 1.3906251192092896,
+      "reward_std": 0.13729803264141083,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.390625,
+      "rewards/curriculum_aware_reward_fn/std": 0.3858727812767029,
+      "step": 1719
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 690.0,
+      "completions/max_terminated_length": 690.0,
+      "completions/mean_length": 384.0446472167969,
+      "completions/mean_terminated_length": 384.0446472167969,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 1.7748258963115813,
+      "grad_norm": 0.6238521933555603,
+      "kl": 0.102294921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0047,
+      "num_tokens": 201722124.0,
+      "reward": 1.4767858982086182,
+      "reward_std": 0.10937154293060303,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47678571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.4864344000816345,
+      "step": 1720
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 782.0,
+      "completions/max_terminated_length": 782.0,
+      "completions/mean_length": 413.3482360839844,
+      "completions/mean_terminated_length": 413.3482360839844,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 1.775857621872582,
+      "grad_norm": 0.8579419851303101,
+      "kl": 0.101806640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0224,
+      "num_tokens": 201827779.0,
+      "reward": 1.2959821224212646,
+      "reward_std": 0.234655499458313,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29598215222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.36423057317733765,
+      "step": 1721
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 872.0,
+      "completions/max_terminated_length": 872.0,
+      "completions/mean_length": 438.26788330078125,
+      "completions/mean_terminated_length": 438.26788330078125,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 1.7768893474335825,
+      "grad_norm": 0.8060879707336426,
+      "kl": 0.0941162109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0327,
+      "num_tokens": 201941937.0,
+      "reward": 1.2674108743667603,
+      "reward_std": 0.17359891533851624,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2674107253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.32719871401786804,
+      "step": 1722
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 811.0,
+      "completions/max_terminated_length": 811.0,
+      "completions/mean_length": 398.64288330078125,
+      "completions/mean_terminated_length": 398.64288330078125,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 1.7779210729945834,
+      "grad_norm": 0.6622222065925598,
+      "kl": 0.0894775390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0084,
+      "num_tokens": 202047618.0,
+      "reward": 1.4589287042617798,
+      "reward_std": 0.12163443863391876,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45892858505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.39639222621917725,
+      "step": 1723
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1050.0,
+      "completions/max_terminated_length": 1050.0,
+      "completions/mean_length": 455.7500305175781,
+      "completions/mean_terminated_length": 455.7500305175781,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "epoch": 1.7789527985555842,
+      "grad_norm": 0.7916659116744995,
+      "kl": 0.101806640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0454,
+      "num_tokens": 202170846.0,
+      "reward": 1.4361608028411865,
+      "reward_std": 0.18718430399894714,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43616071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.3482394516468048,
+      "step": 1724
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 979.0,
+      "completions/max_terminated_length": 979.0,
+      "completions/mean_length": 426.9821472167969,
+      "completions/mean_terminated_length": 426.9821472167969,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "epoch": 1.779984524116585,
+      "grad_norm": 0.719247043132782,
+      "kl": 0.0931396484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0163,
+      "num_tokens": 202283768.0,
+      "reward": 1.3799108266830444,
+      "reward_std": 0.17570778727531433,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37991073727607727,
+      "rewards/curriculum_aware_reward_fn/std": 0.34271439909935,
+      "step": 1725
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 638.0,
+      "completions/max_terminated_length": 638.0,
+      "completions/mean_length": 402.0089416503906,
+      "completions/mean_terminated_length": 402.0089416503906,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 1.7810162496775859,
+      "grad_norm": 0.7211610078811646,
+      "kl": 0.0865478515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0087,
+      "num_tokens": 202392192.0,
+      "reward": 1.2428573369979858,
+      "reward_std": 0.22202114760875702,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24285714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.35105100274086,
+      "step": 1726
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 748.0,
+      "completions/max_terminated_length": 748.0,
+      "completions/mean_length": 437.5357360839844,
+      "completions/mean_terminated_length": 437.5357360839844,
+      "completions/min_length": 117.0,
+      "completions/min_terminated_length": 117.0,
+      "epoch": 1.7820479752385865,
+      "grad_norm": 0.7633671164512634,
+      "kl": 0.087646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.014,
+      "num_tokens": 202501578.0,
+      "reward": 1.3321430683135986,
+      "reward_std": 0.16112089157104492,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3321428596973419,
+      "rewards/curriculum_aware_reward_fn/std": 0.3619179129600525,
+      "step": 1727
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 707.0,
+      "completions/max_terminated_length": 707.0,
+      "completions/mean_length": 413.2946472167969,
+      "completions/mean_terminated_length": 413.2946472167969,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "epoch": 1.7830797007995873,
+      "grad_norm": 0.703177273273468,
+      "kl": 0.091796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0085,
+      "num_tokens": 202614129.0,
+      "reward": 1.4330357313156128,
+      "reward_std": 0.15755793452262878,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4330357015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.368986576795578,
+      "step": 1728
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 778.0,
+      "completions/max_terminated_length": 778.0,
+      "completions/mean_length": 341.8482360839844,
+      "completions/mean_terminated_length": 341.8482360839844,
+      "completions/min_length": 142.0,
+      "completions/min_terminated_length": 142.0,
+      "epoch": 1.784111426360588,
+      "grad_norm": 0.7756032347679138,
+      "kl": 0.103515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0035,
+      "num_tokens": 202709200.0,
+      "reward": 1.5513393878936768,
+      "reward_std": 0.1884058266878128,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.551339328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.44415420293807983,
+      "step": 1729
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 910.0,
+      "completions/max_terminated_length": 910.0,
+      "completions/mean_length": 433.52679443359375,
+      "completions/mean_terminated_length": 433.52679443359375,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 1.7851431519215888,
+      "grad_norm": 0.7666996717453003,
+      "kl": 0.0926513671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0093,
+      "num_tokens": 202822050.0,
+      "reward": 1.3267858028411865,
+      "reward_std": 0.18164461851119995,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32678571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.35829484462738037,
+      "step": 1730
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 874.0,
+      "completions/max_terminated_length": 874.0,
+      "completions/mean_length": 385.1875305175781,
+      "completions/mean_terminated_length": 385.1875305175781,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 1.7861748774825896,
+      "grad_norm": 0.796438455581665,
+      "kl": 0.089111328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0073,
+      "num_tokens": 202927190.0,
+      "reward": 1.4678572416305542,
+      "reward_std": 0.1948614865541458,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4678571820259094,
+      "rewards/curriculum_aware_reward_fn/std": 0.3767366409301758,
+      "step": 1731
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 840.0,
+      "completions/max_terminated_length": 840.0,
+      "completions/mean_length": 418.39288330078125,
+      "completions/mean_terminated_length": 418.39288330078125,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 1.7872066030435905,
+      "grad_norm": 0.4892652630805969,
+      "kl": 0.0892333984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0075,
+      "num_tokens": 203033690.0,
+      "reward": 1.2924108505249023,
+      "reward_std": 0.04891003295779228,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2924107015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.3325583040714264,
+      "step": 1732
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1191.0,
+      "completions/max_terminated_length": 1191.0,
+      "completions/mean_length": 416.01788330078125,
+      "completions/mean_terminated_length": 416.01788330078125,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 1.7882383286045913,
+      "grad_norm": 0.7362436056137085,
+      "kl": 0.09326171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0104,
+      "num_tokens": 203150830.0,
+      "reward": 1.4714287519454956,
+      "reward_std": 0.16051512956619263,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47142860293388367,
+      "rewards/curriculum_aware_reward_fn/std": 0.3712478578090668,
+      "step": 1733
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 992.0,
+      "completions/max_terminated_length": 992.0,
+      "completions/mean_length": 411.89288330078125,
+      "completions/mean_terminated_length": 411.89288330078125,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 1.789270054165592,
+      "grad_norm": 0.6482226252555847,
+      "kl": 0.094482421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0043,
+      "num_tokens": 203261152.0,
+      "reward": 1.4950894117355347,
+      "reward_std": 0.12809297442436218,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4950892925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.3854890465736389,
+      "step": 1734
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 760.0,
+      "completions/max_terminated_length": 760.0,
+      "completions/mean_length": 430.3839416503906,
+      "completions/mean_terminated_length": 430.3839416503906,
+      "completions/min_length": 147.0,
+      "completions/min_terminated_length": 147.0,
+      "epoch": 1.7903017797265928,
+      "grad_norm": 0.7845043540000916,
+      "kl": 0.108642578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0281,
+      "num_tokens": 203385046.0,
+      "reward": 1.2857143878936768,
+      "reward_std": 0.20165906846523285,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2857142984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.34336888790130615,
+      "step": 1735
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 782.0,
+      "completions/max_terminated_length": 782.0,
+      "completions/mean_length": 424.4375305175781,
+      "completions/mean_terminated_length": 424.4375305175781,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 1.7913335052875934,
+      "grad_norm": 0.7672021389007568,
+      "kl": 0.102294921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0143,
+      "num_tokens": 203506601.0,
+      "reward": 1.3767858743667603,
+      "reward_std": 0.15465092658996582,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3767857253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.37059295177459717,
+      "step": 1736
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 845.0,
+      "completions/max_terminated_length": 845.0,
+      "completions/mean_length": 445.8125305175781,
+      "completions/mean_terminated_length": 445.8125305175781,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 1.7923652308485942,
+      "grad_norm": 0.7034952640533447,
+      "kl": 0.10009765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0168,
+      "num_tokens": 203632139.0,
+      "reward": 1.307142972946167,
+      "reward_std": 0.19115585088729858,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30714288353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.3540535867214203,
+      "step": 1737
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 829.0,
+      "completions/max_terminated_length": 829.0,
+      "completions/mean_length": 441.294677734375,
+      "completions/mean_terminated_length": 441.294677734375,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 1.793396956409595,
+      "grad_norm": 0.8159340620040894,
+      "kl": 0.1007080078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0095,
+      "num_tokens": 203760027.0,
+      "reward": 1.2946430444717407,
+      "reward_std": 0.17885927855968475,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2946428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.32238954305648804,
+      "step": 1738
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 795.0,
+      "completions/max_terminated_length": 795.0,
+      "completions/mean_length": 412.3035888671875,
+      "completions/mean_terminated_length": 412.3035888671875,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 1.794428681970596,
+      "grad_norm": 0.7434394955635071,
+      "kl": 0.10009765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0273,
+      "num_tokens": 203862289.0,
+      "reward": 1.3888394832611084,
+      "reward_std": 0.17571541666984558,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3888392746448517,
+      "rewards/curriculum_aware_reward_fn/std": 0.35461628437042236,
+      "step": 1739
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 883.0,
+      "completions/max_terminated_length": 883.0,
+      "completions/mean_length": 438.6696472167969,
+      "completions/mean_terminated_length": 438.6696472167969,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 1.7954604075315967,
+      "grad_norm": 0.7299800515174866,
+      "kl": 0.097412109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0164,
+      "num_tokens": 203986124.0,
+      "reward": 1.2584823369979858,
+      "reward_std": 0.16149833798408508,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2584821581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.33657577633857727,
+      "step": 1740
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1080.0,
+      "completions/max_terminated_length": 1080.0,
+      "completions/mean_length": 421.58038330078125,
+      "completions/mean_terminated_length": 421.58038330078125,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 1.7964921330925974,
+      "grad_norm": 0.6581730842590332,
+      "kl": 0.1060791015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0217,
+      "num_tokens": 204098350.0,
+      "reward": 1.3035714626312256,
+      "reward_std": 0.13841554522514343,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3035714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.34833672642707825,
+      "step": 1741
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 653.0,
+      "completions/max_terminated_length": 653.0,
+      "completions/mean_length": 407.1607360839844,
+      "completions/mean_terminated_length": 407.1607360839844,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 1.797523858653598,
+      "grad_norm": 0.7710352540016174,
+      "kl": 0.09814453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0134,
+      "num_tokens": 204209640.0,
+      "reward": 1.5558037757873535,
+      "reward_std": 0.17522591352462769,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5558035969734192,
+      "rewards/curriculum_aware_reward_fn/std": 0.35878583788871765,
+      "step": 1742
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 784.0,
+      "completions/max_terminated_length": 784.0,
+      "completions/mean_length": 424.6964416503906,
+      "completions/mean_terminated_length": 424.6964416503906,
+      "completions/min_length": 149.0,
+      "completions/min_terminated_length": 149.0,
+      "epoch": 1.7985555842145988,
+      "grad_norm": 0.8840100765228271,
+      "kl": 0.1064453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0236,
+      "num_tokens": 204322936.0,
+      "reward": 1.386160969734192,
+      "reward_std": 0.18574056029319763,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3861607015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.36442041397094727,
+      "step": 1743
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 756.0,
+      "completions/max_terminated_length": 756.0,
+      "completions/mean_length": 431.20538330078125,
+      "completions/mean_terminated_length": 431.20538330078125,
+      "completions/min_length": 275.0,
+      "completions/min_terminated_length": 275.0,
+      "epoch": 1.7995873097755997,
+      "grad_norm": 0.801786482334137,
+      "kl": 0.099853515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0202,
+      "num_tokens": 204449673.0,
+      "reward": 1.432142972946167,
+      "reward_std": 0.19646649062633514,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43214288353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.37283048033714294,
+      "step": 1744
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 979.0,
+      "completions/max_terminated_length": 979.0,
+      "completions/mean_length": 379.9196472167969,
+      "completions/mean_terminated_length": 379.9196472167969,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 1.8006190353366005,
+      "grad_norm": 0.7736554741859436,
+      "kl": 0.12744140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0149,
+      "num_tokens": 204562252.0,
+      "reward": 1.4883930683135986,
+      "reward_std": 0.12401814758777618,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4883928894996643,
+      "rewards/curriculum_aware_reward_fn/std": 0.3882846534252167,
+      "step": 1745
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 725.0,
+      "completions/max_terminated_length": 725.0,
+      "completions/mean_length": 445.1607360839844,
+      "completions/mean_terminated_length": 445.1607360839844,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 1.8016507608976013,
+      "grad_norm": 0.8602623343467712,
+      "kl": 0.105224609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0268,
+      "num_tokens": 204678911.0,
+      "reward": 1.4656250476837158,
+      "reward_std": 0.24137361347675323,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46562501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.4218166768550873,
+      "step": 1746
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1195.0,
+      "completions/max_terminated_length": 1195.0,
+      "completions/mean_length": 441.6785888671875,
+      "completions/mean_terminated_length": 441.6785888671875,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 1.802682486458602,
+      "grad_norm": 0.659216046333313,
+      "kl": 0.0859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0055,
+      "num_tokens": 204797028.0,
+      "reward": 1.4169642925262451,
+      "reward_std": 0.11663386970758438,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4169642925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.4146828353404999,
+      "step": 1747
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 744.0,
+      "completions/max_terminated_length": 744.0,
+      "completions/mean_length": 418.1964416503906,
+      "completions/mean_terminated_length": 418.1964416503906,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "epoch": 1.8037142120196028,
+      "grad_norm": 0.7895244359970093,
+      "kl": 0.0970458984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0013,
+      "num_tokens": 204921804.0,
+      "reward": 1.3651787042617798,
+      "reward_std": 0.1474582552909851,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36517858505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.3904743790626526,
+      "step": 1748
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 742.0,
+      "completions/max_terminated_length": 742.0,
+      "completions/mean_length": 430.0357360839844,
+      "completions/mean_terminated_length": 430.0357360839844,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "epoch": 1.8047459375806034,
+      "grad_norm": 0.8099697232246399,
+      "kl": 0.09326171875,
+      "learning_rate": 1e-06,
+      "loss": 0.006,
+      "num_tokens": 205040122.0,
+      "reward": 1.3459821939468384,
+      "reward_std": 0.2298395037651062,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3549107015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.4184354841709137,
+      "step": 1749
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 878.0,
+      "completions/max_terminated_length": 878.0,
+      "completions/mean_length": 431.3571472167969,
+      "completions/mean_terminated_length": 431.3571472167969,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 1.8057776631416043,
+      "grad_norm": 0.600109338760376,
+      "kl": 0.09130859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0148,
+      "num_tokens": 205154024.0,
+      "reward": 1.3263393640518188,
+      "reward_std": 0.09809020906686783,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3263392746448517,
+      "rewards/curriculum_aware_reward_fn/std": 0.4202616214752197,
+      "step": 1750
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 703.0,
+      "completions/max_terminated_length": 703.0,
+      "completions/mean_length": 410.6785888671875,
+      "completions/mean_terminated_length": 410.6785888671875,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 1.806809388702605,
+      "grad_norm": 0.8304281234741211,
+      "kl": 0.1104736328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0238,
+      "num_tokens": 205262026.0,
+      "reward": 1.4589287042617798,
+      "reward_std": 0.2656938135623932,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47678568959236145,
+      "rewards/curriculum_aware_reward_fn/std": 0.4030809700489044,
+      "step": 1751
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1187.0,
+      "completions/max_terminated_length": 1187.0,
+      "completions/mean_length": 411.7946472167969,
+      "completions/mean_terminated_length": 411.7946472167969,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "epoch": 1.807841114263606,
+      "grad_norm": 0.5996778011322021,
+      "kl": 0.0955810546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0341,
+      "num_tokens": 205372232.0,
+      "reward": 1.3665179014205933,
+      "reward_std": 0.12944695353507996,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3665178418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.3909095525741577,
+      "step": 1752
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1026.0,
+      "completions/max_terminated_length": 1026.0,
+      "completions/mean_length": 408.6071472167969,
+      "completions/mean_terminated_length": 408.6071472167969,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 1.8088728398246068,
+      "grad_norm": 0.7588355541229248,
+      "kl": 0.0958251953125,
+      "learning_rate": 1e-06,
+      "loss": 0.002,
+      "num_tokens": 205492504.0,
+      "reward": 1.5517857074737549,
+      "reward_std": 0.18035660684108734,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5517856478691101,
+      "rewards/curriculum_aware_reward_fn/std": 0.44508975744247437,
+      "step": 1753
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 813.0,
+      "completions/max_terminated_length": 813.0,
+      "completions/mean_length": 431.0000305175781,
+      "completions/mean_terminated_length": 431.0000305175781,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 1.8099045653856074,
+      "grad_norm": 0.770481526851654,
+      "kl": 0.0980224609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0309,
+      "num_tokens": 205607066.0,
+      "reward": 1.4549108743667603,
+      "reward_std": 0.14845071732997894,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4549106955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.41622281074523926,
+      "step": 1754
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 677.0,
+      "completions/max_terminated_length": 677.0,
+      "completions/mean_length": 393.0000305175781,
+      "completions/mean_terminated_length": 393.0000305175781,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 1.8109362909466082,
+      "grad_norm": 0.7532888054847717,
+      "kl": 0.0965576171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0078,
+      "num_tokens": 205712038.0,
+      "reward": 1.3513394594192505,
+      "reward_std": 0.1509929597377777,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35133928060531616,
+      "rewards/curriculum_aware_reward_fn/std": 0.42418229579925537,
+      "step": 1755
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 872.0,
+      "completions/max_terminated_length": 872.0,
+      "completions/mean_length": 446.6785888671875,
+      "completions/mean_terminated_length": 446.6785888671875,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 1.8119680165076089,
+      "grad_norm": 0.7780160903930664,
+      "kl": 0.0970458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0138,
+      "num_tokens": 205833949.0,
+      "reward": 1.2696428298950195,
+      "reward_std": 0.20583878457546234,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2696428596973419,
+      "rewards/curriculum_aware_reward_fn/std": 0.3794470727443695,
+      "step": 1756
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 670.0,
+      "completions/max_terminated_length": 670.0,
+      "completions/mean_length": 359.6071472167969,
+      "completions/mean_terminated_length": 359.6071472167969,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 1.8129997420686097,
+      "grad_norm": 0.631820023059845,
+      "kl": 0.1031494140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0187,
+      "num_tokens": 205930848.0,
+      "reward": 1.600000023841858,
+      "reward_std": 0.08937109261751175,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5999999642372131,
+      "rewards/curriculum_aware_reward_fn/std": 0.42181503772735596,
+      "step": 1757
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 979.0,
+      "completions/max_terminated_length": 979.0,
+      "completions/mean_length": 444.5089416503906,
+      "completions/mean_terminated_length": 444.5089416503906,
+      "completions/min_length": 149.0,
+      "completions/min_terminated_length": 149.0,
+      "epoch": 1.8140314676296105,
+      "grad_norm": 0.8031406402587891,
+      "kl": 0.0958251953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0058,
+      "num_tokens": 206046517.0,
+      "reward": 1.399553656578064,
+      "reward_std": 0.17127352952957153,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3995535671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.4392610490322113,
+      "step": 1758
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 682.0,
+      "completions/max_terminated_length": 682.0,
+      "completions/mean_length": 370.4910888671875,
+      "completions/mean_terminated_length": 370.4910888671875,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 1.8150631931906114,
+      "grad_norm": 0.7603376507759094,
+      "kl": 0.1036376953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0014,
+      "num_tokens": 206148289.0,
+      "reward": 1.7169642448425293,
+      "reward_std": 0.1733575463294983,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7169643044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.3697182834148407,
+      "step": 1759
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 702.0,
+      "completions/max_terminated_length": 702.0,
+      "completions/mean_length": 425.9375305175781,
+      "completions/mean_terminated_length": 425.9375305175781,
+      "completions/min_length": 105.0,
+      "completions/min_terminated_length": 105.0,
+      "epoch": 1.8160949187516122,
+      "grad_norm": 0.81327223777771,
+      "kl": 0.1011962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0403,
+      "num_tokens": 206261311.0,
+      "reward": 1.4276787042617798,
+      "reward_std": 0.20803497731685638,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42767858505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.4219666123390198,
+      "step": 1760
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 702.0,
+      "completions/max_terminated_length": 702.0,
+      "completions/mean_length": 409.52679443359375,
+      "completions/mean_terminated_length": 409.52679443359375,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 1.8171266443126128,
+      "grad_norm": 0.8098947405815125,
+      "kl": 0.10400390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0092,
+      "num_tokens": 206377388.0,
+      "reward": 1.594642996788025,
+      "reward_std": 0.3190336227416992,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5946428179740906,
+      "rewards/curriculum_aware_reward_fn/std": 0.48543453216552734,
+      "step": 1761
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1371.0,
+      "completions/max_terminated_length": 1371.0,
+      "completions/mean_length": 454.1607360839844,
+      "completions/mean_terminated_length": 454.1607360839844,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 1.8181583698736135,
+      "grad_norm": 0.7259615659713745,
+      "kl": 0.095458984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0238,
+      "num_tokens": 206494936.0,
+      "reward": 1.5138393640518188,
+      "reward_std": 0.18669991195201874,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5138392448425293,
+      "rewards/curriculum_aware_reward_fn/std": 0.4641280770301819,
+      "step": 1762
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 930.0,
+      "completions/max_terminated_length": 930.0,
+      "completions/mean_length": 478.3214416503906,
+      "completions/mean_terminated_length": 478.3214416503906,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "epoch": 1.8191900954346143,
+      "grad_norm": 0.7194581031799316,
+      "kl": 0.1046142578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0265,
+      "num_tokens": 206617808.0,
+      "reward": 1.4325894117355347,
+      "reward_std": 0.19652755558490753,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43258926272392273,
+      "rewards/curriculum_aware_reward_fn/std": 0.39896881580352783,
+      "step": 1763
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2077.0,
+      "completions/max_terminated_length": 2077.0,
+      "completions/mean_length": 488.3214416503906,
+      "completions/mean_terminated_length": 488.3214416503906,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 1.8202218209956151,
+      "grad_norm": 0.7811593413352966,
+      "kl": 0.0987548828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0242,
+      "num_tokens": 206734870.0,
+      "reward": 1.3196429014205933,
+      "reward_std": 0.2123473733663559,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3196428418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.3914259672164917,
+      "step": 1764
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 861.0,
+      "completions/max_terminated_length": 861.0,
+      "completions/mean_length": 430.2857360839844,
+      "completions/mean_terminated_length": 430.2857360839844,
+      "completions/min_length": 161.0,
+      "completions/min_terminated_length": 161.0,
+      "epoch": 1.821253546556616,
+      "grad_norm": 0.6582061648368835,
+      "kl": 0.1060791015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0062,
+      "num_tokens": 206848246.0,
+      "reward": 1.4267858266830444,
+      "reward_std": 0.1399049013853073,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4267857074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.44957080483436584,
+      "step": 1765
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 986.0,
+      "completions/max_terminated_length": 986.0,
+      "completions/mean_length": 409.4821472167969,
+      "completions/mean_terminated_length": 409.4821472167969,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 1.8222852721176168,
+      "grad_norm": 0.7228449583053589,
+      "kl": 0.1021728515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0014,
+      "num_tokens": 206957479.0,
+      "reward": 1.4669643640518188,
+      "reward_std": 0.20963212847709656,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4669642746448517,
+      "rewards/curriculum_aware_reward_fn/std": 0.44268599152565,
+      "step": 1766
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 716.0,
+      "completions/max_terminated_length": 716.0,
+      "completions/mean_length": 437.0982360839844,
+      "completions/mean_terminated_length": 437.0982360839844,
+      "completions/min_length": 245.0,
+      "completions/min_terminated_length": 245.0,
+      "epoch": 1.8233169976786174,
+      "grad_norm": 0.8272615075111389,
+      "kl": 0.108642578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0006,
+      "num_tokens": 207069854.0,
+      "reward": 1.3575893640518188,
+      "reward_std": 0.22051258385181427,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3575893044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.3869032859802246,
+      "step": 1767
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 899.0,
+      "completions/max_terminated_length": 899.0,
+      "completions/mean_length": 451.5089416503906,
+      "completions/mean_terminated_length": 451.5089416503906,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 1.8243487232396183,
+      "grad_norm": 0.8047498464584351,
+      "kl": 0.10888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0033,
+      "num_tokens": 207184970.0,
+      "reward": 1.3651785850524902,
+      "reward_std": 0.19732783734798431,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36517858505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.40862545371055603,
+      "step": 1768
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 817.0,
+      "completions/max_terminated_length": 817.0,
+      "completions/mean_length": 461.15179443359375,
+      "completions/mean_terminated_length": 461.15179443359375,
+      "completions/min_length": 231.0,
+      "completions/min_terminated_length": 231.0,
+      "epoch": 1.825380448800619,
+      "grad_norm": 0.7979851961135864,
+      "kl": 0.1019287109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0003,
+      "num_tokens": 207306444.0,
+      "reward": 1.2544643878936768,
+      "reward_std": 0.1744467318058014,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2544642984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.30913665890693665,
+      "step": 1769
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 843.0,
+      "completions/max_terminated_length": 843.0,
+      "completions/mean_length": 467.232177734375,
+      "completions/mean_terminated_length": 467.232177734375,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 1.8264121743616197,
+      "grad_norm": 0.7705732583999634,
+      "kl": 0.10009765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0121,
+      "num_tokens": 207430769.0,
+      "reward": 1.4004465341567993,
+      "reward_std": 0.14401130378246307,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40044644474983215,
+      "rewards/curriculum_aware_reward_fn/std": 0.41549360752105713,
+      "step": 1770
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 899.0,
+      "completions/max_terminated_length": 899.0,
+      "completions/mean_length": 471.01788330078125,
+      "completions/mean_terminated_length": 471.01788330078125,
+      "completions/min_length": 276.0,
+      "completions/min_terminated_length": 276.0,
+      "epoch": 1.8274438999226206,
+      "grad_norm": 0.7397488355636597,
+      "kl": 0.091796875,
+      "learning_rate": 1e-06,
+      "loss": -0.007,
+      "num_tokens": 207554125.0,
+      "reward": 1.4316965341567993,
+      "reward_std": 0.2323623150587082,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43169641494750977,
+      "rewards/curriculum_aware_reward_fn/std": 0.4476664066314697,
+      "step": 1771
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 835.0,
+      "completions/max_terminated_length": 835.0,
+      "completions/mean_length": 455.2500305175781,
+      "completions/mean_terminated_length": 455.2500305175781,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 1.8284756254836214,
+      "grad_norm": 0.8286288380622864,
+      "kl": 0.1236572265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0351,
+      "num_tokens": 207670976.0,
+      "reward": 1.411607265472412,
+      "reward_std": 0.2313155084848404,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41160711646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.38007715344429016,
+      "step": 1772
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 898.0,
+      "completions/max_terminated_length": 898.0,
+      "completions/mean_length": 467.8750305175781,
+      "completions/mean_terminated_length": 467.8750305175781,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 1.8295073510446223,
+      "grad_norm": 0.7537168860435486,
+      "kl": 0.1103515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0043,
+      "num_tokens": 207787561.0,
+      "reward": 1.376339316368103,
+      "reward_std": 0.24346770346164703,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37633928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3885732591152191,
+      "step": 1773
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1012.0,
+      "completions/max_terminated_length": 1012.0,
+      "completions/mean_length": 454.4107360839844,
+      "completions/mean_terminated_length": 454.4107360839844,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 1.8305390766056229,
+      "grad_norm": 0.6410990953445435,
+      "kl": 0.103759765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0015,
+      "num_tokens": 207905732.0,
+      "reward": 1.3656251430511475,
+      "reward_std": 0.14624857902526855,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3656249940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.4483719766139984,
+      "step": 1774
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1396.0,
+      "completions/max_terminated_length": 1396.0,
+      "completions/mean_length": 499.15179443359375,
+      "completions/mean_terminated_length": 499.15179443359375,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 1.8315708021666237,
+      "grad_norm": 0.7663962841033936,
+      "kl": 0.107177734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0196,
+      "num_tokens": 208023608.0,
+      "reward": 1.322767972946167,
+      "reward_std": 0.17575961351394653,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32276788353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.39013099670410156,
+      "step": 1775
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1136.0,
+      "completions/max_terminated_length": 1136.0,
+      "completions/mean_length": 465.7589416503906,
+      "completions/mean_terminated_length": 465.7589416503906,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 1.8326025277276243,
+      "grad_norm": 0.6929665803909302,
+      "kl": 0.0963134765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0146,
+      "num_tokens": 208142805.0,
+      "reward": 1.4830358028411865,
+      "reward_std": 0.1856861263513565,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48303574323654175,
+      "rewards/curriculum_aware_reward_fn/std": 0.4536415934562683,
+      "step": 1776
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 842.0,
+      "completions/max_terminated_length": 842.0,
+      "completions/mean_length": 457.0714416503906,
+      "completions/mean_terminated_length": 457.0714416503906,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 1.8336342532886252,
+      "grad_norm": 0.639695942401886,
+      "kl": 0.1048583984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0176,
+      "num_tokens": 208263640.0,
+      "reward": 1.345089316368103,
+      "reward_std": 0.14022766053676605,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34508928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.4171416759490967,
+      "step": 1777
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 849.0,
+      "completions/max_terminated_length": 849.0,
+      "completions/mean_length": 476.6250305175781,
+      "completions/mean_terminated_length": 476.6250305175781,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 1.834665978849626,
+      "grad_norm": 0.6727584004402161,
+      "kl": 0.10595703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0178,
+      "num_tokens": 208385531.0,
+      "reward": 1.3147321939468384,
+      "reward_std": 0.20331941545009613,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3147321343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.38913384079933167,
+      "step": 1778
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 893.0,
+      "completions/max_terminated_length": 893.0,
+      "completions/mean_length": 427.3214416503906,
+      "completions/mean_terminated_length": 427.3214416503906,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 1.8356977044106269,
+      "grad_norm": 0.7415899038314819,
+      "kl": 0.104736328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0259,
+      "num_tokens": 208503153.0,
+      "reward": 1.553125023841858,
+      "reward_std": 0.24731476604938507,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5531249642372131,
+      "rewards/curriculum_aware_reward_fn/std": 0.43346917629241943,
+      "step": 1779
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 956.0,
+      "completions/max_terminated_length": 956.0,
+      "completions/mean_length": 451.3839416503906,
+      "completions/mean_terminated_length": 451.3839416503906,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 1.8367294299716277,
+      "grad_norm": 0.7726976275444031,
+      "kl": 0.1026611328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0245,
+      "num_tokens": 208618682.0,
+      "reward": 1.4790178537368774,
+      "reward_std": 0.2106853425502777,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47901788353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.4393196105957031,
+      "step": 1780
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 911.0,
+      "completions/max_terminated_length": 911.0,
+      "completions/mean_length": 490.01788330078125,
+      "completions/mean_terminated_length": 490.01788330078125,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "epoch": 1.8377611555326283,
+      "grad_norm": 0.8864200115203857,
+      "kl": 0.1116943359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0219,
+      "num_tokens": 208744215.0,
+      "reward": 1.4366071224212646,
+      "reward_std": 0.30744442343711853,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43660715222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.4401567280292511,
+      "step": 1781
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 965.0,
+      "completions/max_terminated_length": 965.0,
+      "completions/mean_length": 426.9285888671875,
+      "completions/mean_terminated_length": 426.9285888671875,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 1.8387928810936292,
+      "grad_norm": 0.6458997130393982,
+      "kl": 0.1063232421875,
+      "learning_rate": 1e-06,
+      "loss": 0.027,
+      "num_tokens": 208861933.0,
+      "reward": 1.6839287281036377,
+      "reward_std": 0.16594833135604858,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6839285492897034,
+      "rewards/curriculum_aware_reward_fn/std": 0.39713016152381897,
+      "step": 1782
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 777.0,
+      "completions/max_terminated_length": 777.0,
+      "completions/mean_length": 453.6785888671875,
+      "completions/mean_terminated_length": 453.6785888671875,
+      "completions/min_length": 263.0,
+      "completions/min_terminated_length": 263.0,
+      "epoch": 1.8398246066546298,
+      "grad_norm": 0.821609377861023,
+      "kl": 0.1063232421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0072,
+      "num_tokens": 208980379.0,
+      "reward": 1.4950894117355347,
+      "reward_std": 0.28672200441360474,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49508926272392273,
+      "rewards/curriculum_aware_reward_fn/std": 0.44076913595199585,
+      "step": 1783
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 971.0,
+      "completions/max_terminated_length": 971.0,
+      "completions/mean_length": 454.5982360839844,
+      "completions/mean_terminated_length": 454.5982360839844,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 1.8408563322156306,
+      "grad_norm": 0.7601539492607117,
+      "kl": 0.100341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0128,
+      "num_tokens": 209104171.0,
+      "reward": 1.4446429014205933,
+      "reward_std": 0.21672558784484863,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4446428418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.4232732355594635,
+      "step": 1784
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1097.0,
+      "completions/max_terminated_length": 1097.0,
+      "completions/mean_length": 512.4375,
+      "completions/mean_terminated_length": 512.4375,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 1.8418880577766314,
+      "grad_norm": 0.7791252732276917,
+      "kl": 0.1077880859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0133,
+      "num_tokens": 209236793.0,
+      "reward": 1.2834821939468384,
+      "reward_std": 0.20960944890975952,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2834821343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.3648638129234314,
+      "step": 1785
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 707.0,
+      "completions/max_terminated_length": 707.0,
+      "completions/mean_length": 451.607177734375,
+      "completions/mean_terminated_length": 451.607177734375,
+      "completions/min_length": 158.0,
+      "completions/min_terminated_length": 158.0,
+      "epoch": 1.8429197833376323,
+      "grad_norm": 0.7502228617668152,
+      "kl": 0.0966796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0048,
+      "num_tokens": 209355903.0,
+      "reward": 1.3714288473129272,
+      "reward_std": 0.20309315621852875,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37142854928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.3378998637199402,
+      "step": 1786
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 668.0,
+      "completions/max_terminated_length": 668.0,
+      "completions/mean_length": 416.0089416503906,
+      "completions/mean_terminated_length": 416.0089416503906,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 1.8439515088986331,
+      "grad_norm": 0.8292548656463623,
+      "kl": 0.101318359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0298,
+      "num_tokens": 209468996.0,
+      "reward": 1.6040180921554565,
+      "reward_std": 0.2973346710205078,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6040178537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.43434420228004456,
+      "step": 1787
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1027.0,
+      "completions/max_terminated_length": 1027.0,
+      "completions/mean_length": 495.5000305175781,
+      "completions/mean_terminated_length": 495.5000305175781,
+      "completions/min_length": 271.0,
+      "completions/min_terminated_length": 271.0,
+      "epoch": 1.8449832344596337,
+      "grad_norm": 0.7564486861228943,
+      "kl": 0.098876953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0146,
+      "num_tokens": 209600330.0,
+      "reward": 1.4656251668930054,
+      "reward_std": 0.20407995581626892,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46562501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.37907135486602783,
+      "step": 1788
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 771.0,
+      "completions/max_terminated_length": 771.0,
+      "completions/mean_length": 426.65179443359375,
+      "completions/mean_terminated_length": 426.65179443359375,
+      "completions/min_length": 151.0,
+      "completions/min_terminated_length": 151.0,
+      "epoch": 1.8460149600206344,
+      "grad_norm": 0.7819154262542725,
+      "kl": 0.1094970703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0086,
+      "num_tokens": 209712473.0,
+      "reward": 1.4290179014205933,
+      "reward_std": 0.19162015616893768,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4290178418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.41669216752052307,
+      "step": 1789
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 908.0,
+      "completions/max_terminated_length": 908.0,
+      "completions/mean_length": 452.5982360839844,
+      "completions/mean_terminated_length": 452.5982360839844,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 1.8470466855816352,
+      "grad_norm": 0.6342150568962097,
+      "kl": 0.0968017578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0137,
+      "num_tokens": 209828243.0,
+      "reward": 1.5089287757873535,
+      "reward_std": 0.1409773826599121,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5089285969734192,
+      "rewards/curriculum_aware_reward_fn/std": 0.4546166658401489,
+      "step": 1790
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 977.0,
+      "completions/max_terminated_length": 977.0,
+      "completions/mean_length": 456.794677734375,
+      "completions/mean_terminated_length": 456.794677734375,
+      "completions/min_length": 258.0,
+      "completions/min_terminated_length": 258.0,
+      "epoch": 1.848078411142636,
+      "grad_norm": 0.7057480812072754,
+      "kl": 0.10302734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0115,
+      "num_tokens": 209951112.0,
+      "reward": 1.4026787281036377,
+      "reward_std": 0.21222275495529175,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40267854928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.411536306142807,
+      "step": 1791
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 726.0,
+      "completions/max_terminated_length": 726.0,
+      "completions/mean_length": 445.669677734375,
+      "completions/mean_terminated_length": 445.669677734375,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 1.8491101367036369,
+      "grad_norm": 0.7759240865707397,
+      "kl": 0.1025390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0092,
+      "num_tokens": 210064846.0,
+      "reward": 1.3375000953674316,
+      "reward_std": 0.18568792939186096,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3734125792980194,
+      "step": 1792
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 667.0,
+      "completions/max_terminated_length": 667.0,
+      "completions/mean_length": 353.2321472167969,
+      "completions/mean_terminated_length": 353.2321472167969,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 1.8501418622646377,
+      "grad_norm": 0.6455899477005005,
+      "kl": 0.1112060546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0089,
+      "num_tokens": 210162501.0,
+      "reward": 1.4767858982086182,
+      "reward_std": 0.15260063111782074,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47678571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.46197426319122314,
+      "step": 1793
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1048.0,
+      "completions/max_terminated_length": 1048.0,
+      "completions/mean_length": 444.15179443359375,
+      "completions/mean_terminated_length": 444.15179443359375,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 1.8511735878256383,
+      "grad_norm": 0.7108426094055176,
+      "kl": 0.109130859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0269,
+      "num_tokens": 210287020.0,
+      "reward": 1.6000001430511475,
+      "reward_std": 0.19978836178779602,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5999999642372131,
+      "rewards/curriculum_aware_reward_fn/std": 0.43335065245628357,
+      "step": 1794
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1021.0,
+      "completions/max_terminated_length": 1021.0,
+      "completions/mean_length": 470.46429443359375,
+      "completions/mean_terminated_length": 470.46429443359375,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 1.8522053133866392,
+      "grad_norm": 0.8278527855873108,
+      "kl": 0.10302734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0089,
+      "num_tokens": 210412449.0,
+      "reward": 1.411607265472412,
+      "reward_std": 0.2243293821811676,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41160711646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.430115282535553,
+      "step": 1795
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 750.0,
+      "completions/max_terminated_length": 750.0,
+      "completions/mean_length": 398.8125305175781,
+      "completions/mean_terminated_length": 398.8125305175781,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 1.8532370389476398,
+      "grad_norm": 0.7692654728889465,
+      "kl": 0.107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0051,
+      "num_tokens": 210522580.0,
+      "reward": 1.4446429014205933,
+      "reward_std": 0.2773749828338623,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4446428418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.46165376901626587,
+      "step": 1796
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 901.0,
+      "completions/max_terminated_length": 901.0,
+      "completions/mean_length": 405.1696472167969,
+      "completions/mean_terminated_length": 405.1696472167969,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 1.8542687645086406,
+      "grad_norm": 0.9778786897659302,
+      "kl": 0.1536865234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0176,
+      "num_tokens": 210629895.0,
+      "reward": 1.532142996788025,
+      "reward_std": 0.22348810732364655,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5321428775787354,
+      "rewards/curriculum_aware_reward_fn/std": 0.4495171010494232,
+      "step": 1797
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 808.0,
+      "completions/max_terminated_length": 808.0,
+      "completions/mean_length": 429.58929443359375,
+      "completions/mean_terminated_length": 429.58929443359375,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 1.8553004900696415,
+      "grad_norm": 0.8595057129859924,
+      "kl": 0.0970458984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0289,
+      "num_tokens": 210747470.0,
+      "reward": 1.4549108743667603,
+      "reward_std": 0.1990305483341217,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4549107253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.4242619276046753,
+      "step": 1798
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1201.0,
+      "completions/max_terminated_length": 1201.0,
+      "completions/mean_length": 465.1339416503906,
+      "completions/mean_terminated_length": 465.1339416503906,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 1.8563322156306423,
+      "grad_norm": 0.7282925844192505,
+      "kl": 0.0888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0006,
+      "num_tokens": 210867766.0,
+      "reward": 1.500892996788025,
+      "reward_std": 0.22749468684196472,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5008928179740906,
+      "rewards/curriculum_aware_reward_fn/std": 0.4586988091468811,
+      "step": 1799
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1087.0,
+      "completions/max_terminated_length": 1087.0,
+      "completions/mean_length": 453.3839416503906,
+      "completions/mean_terminated_length": 453.3839416503906,
+      "completions/min_length": 271.0,
+      "completions/min_terminated_length": 271.0,
+      "epoch": 1.8573639411916432,
+      "grad_norm": 0.7182178497314453,
+      "kl": 0.1009521484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0028,
+      "num_tokens": 210981307.0,
+      "reward": 1.3674107789993286,
+      "reward_std": 0.22563178837299347,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36741071939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.4080667495727539,
+      "step": 1800
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 771.0,
+      "completions/max_terminated_length": 771.0,
+      "completions/mean_length": 394.45538330078125,
+      "completions/mean_terminated_length": 394.45538330078125,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 1.8583956667526438,
+      "grad_norm": 0.7896085977554321,
+      "kl": 0.1163330078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0232,
+      "num_tokens": 211086596.0,
+      "reward": 1.6044644117355347,
+      "reward_std": 0.1873040646314621,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6044642329216003,
+      "rewards/curriculum_aware_reward_fn/std": 0.4967712461948395,
+      "step": 1801
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 757.0,
+      "completions/max_terminated_length": 757.0,
+      "completions/mean_length": 420.95538330078125,
+      "completions/mean_terminated_length": 420.95538330078125,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "epoch": 1.8594273923136446,
+      "grad_norm": 0.8211988806724548,
+      "kl": 0.119384765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0088,
+      "num_tokens": 211204093.0,
+      "reward": 1.5977680683135986,
+      "reward_std": 0.21861636638641357,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5977678298950195,
+      "rewards/curriculum_aware_reward_fn/std": 0.4041111171245575,
+      "step": 1802
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 789.0,
+      "completions/max_terminated_length": 789.0,
+      "completions/mean_length": 418.3750305175781,
+      "completions/mean_terminated_length": 418.3750305175781,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 1.8604591178746452,
+      "grad_norm": 0.7635685205459595,
+      "kl": 0.1046142578125,
+      "learning_rate": 1e-06,
+      "loss": -0.007,
+      "num_tokens": 211315358.0,
+      "reward": 1.289285659790039,
+      "reward_std": 0.1520286202430725,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.28928571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.37724870443344116,
+      "step": 1803
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 635.0,
+      "completions/max_terminated_length": 635.0,
+      "completions/mean_length": 389.8839416503906,
+      "completions/mean_terminated_length": 389.8839416503906,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 1.861490843435646,
+      "grad_norm": 0.7321354150772095,
+      "kl": 0.1064453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0067,
+      "num_tokens": 211423880.0,
+      "reward": 1.5392858982086182,
+      "reward_std": 0.17979271709918976,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5392857193946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.4290902018547058,
+      "step": 1804
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1385.0,
+      "completions/max_terminated_length": 1385.0,
+      "completions/mean_length": 468.4464416503906,
+      "completions/mean_terminated_length": 468.4464416503906,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 1.862522568996647,
+      "grad_norm": 0.6947965025901794,
+      "kl": 0.117431640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0201,
+      "num_tokens": 211542552.0,
+      "reward": 1.4254463911056519,
+      "reward_std": 0.18100708723068237,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43437501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.4794906675815582,
+      "step": 1805
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1368.0,
+      "completions/max_terminated_length": 1368.0,
+      "completions/mean_length": 430.95538330078125,
+      "completions/mean_terminated_length": 430.95538330078125,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 1.8635542945576478,
+      "grad_norm": 0.6890603303909302,
+      "kl": 0.1099853515625,
+      "learning_rate": 1e-06,
+      "loss": -0.012,
+      "num_tokens": 211663024.0,
+      "reward": 1.3941963911056519,
+      "reward_std": 0.12359840422868729,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39419645071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.42228174209594727,
+      "step": 1806
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1509.0,
+      "completions/max_terminated_length": 1509.0,
+      "completions/mean_length": 462.1875305175781,
+      "completions/mean_terminated_length": 462.1875305175781,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "epoch": 1.8645860201186486,
+      "grad_norm": 0.7871446013450623,
+      "kl": 0.1162109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0177,
+      "num_tokens": 211785636.0,
+      "reward": 1.46473228931427,
+      "reward_std": 0.22002539038658142,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4647321403026581,
+      "rewards/curriculum_aware_reward_fn/std": 0.5697572827339172,
+      "step": 1807
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 788.0,
+      "completions/max_terminated_length": 788.0,
+      "completions/mean_length": 429.65179443359375,
+      "completions/mean_terminated_length": 429.65179443359375,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 1.8656177456796492,
+      "grad_norm": 0.614851713180542,
+      "kl": 0.1131591796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0053,
+      "num_tokens": 211902861.0,
+      "reward": 1.4138394594192505,
+      "reward_std": 0.10892920196056366,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41383928060531616,
+      "rewards/curriculum_aware_reward_fn/std": 0.44343101978302,
+      "step": 1808
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1656.0,
+      "completions/max_terminated_length": 1656.0,
+      "completions/mean_length": 482.982177734375,
+      "completions/mean_terminated_length": 482.982177734375,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 1.8666494712406498,
+      "grad_norm": 0.6550900340080261,
+      "kl": 0.107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0188,
+      "num_tokens": 212023945.0,
+      "reward": 1.2910715341567993,
+      "reward_std": 0.13532520830631256,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29107141494750977,
+      "rewards/curriculum_aware_reward_fn/std": 0.4194169044494629,
+      "step": 1809
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1476.0,
+      "completions/max_terminated_length": 1476.0,
+      "completions/mean_length": 448.6964416503906,
+      "completions/mean_terminated_length": 448.6964416503906,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 1.8676811968016507,
+      "grad_norm": 0.7046249508857727,
+      "kl": 0.1085205078125,
+      "learning_rate": 1e-06,
+      "loss": -0.011,
+      "num_tokens": 212141952.0,
+      "reward": 1.3834823369979858,
+      "reward_std": 0.16871017217636108,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3834821283817291,
+      "rewards/curriculum_aware_reward_fn/std": 0.4509443938732147,
+      "step": 1810
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 658.0,
+      "completions/max_terminated_length": 658.0,
+      "completions/mean_length": 409.89288330078125,
+      "completions/mean_terminated_length": 409.89288330078125,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 1.8687129223626515,
+      "grad_norm": 0.8548352122306824,
+      "kl": 0.124267578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0075,
+      "num_tokens": 212256215.0,
+      "reward": 1.5258928537368774,
+      "reward_std": 0.21912138164043427,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5258928537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.41611576080322266,
+      "step": 1811
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1222.0,
+      "completions/max_terminated_length": 1222.0,
+      "completions/mean_length": 450.4375305175781,
+      "completions/mean_terminated_length": 450.4375305175781,
+      "completions/min_length": 161.0,
+      "completions/min_terminated_length": 161.0,
+      "epoch": 1.8697446479236524,
+      "grad_norm": 0.5630614161491394,
+      "kl": 0.089599609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0042,
+      "num_tokens": 212377609.0,
+      "reward": 1.415178656578064,
+      "reward_std": 0.1340329647064209,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4151785671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.4354163110256195,
+      "step": 1812
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1065.0,
+      "completions/max_terminated_length": 1065.0,
+      "completions/mean_length": 484.08038330078125,
+      "completions/mean_terminated_length": 484.08038330078125,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 1.8707763734846532,
+      "grad_norm": 0.6303600668907166,
+      "kl": 0.1007080078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0059,
+      "num_tokens": 212510296.0,
+      "reward": 1.2727680206298828,
+      "reward_std": 0.08729679882526398,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2727678418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.3543734848499298,
+      "step": 1813
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 632.0,
+      "completions/max_terminated_length": 632.0,
+      "completions/mean_length": 388.5000305175781,
+      "completions/mean_terminated_length": 388.5000305175781,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 1.8718080990456538,
+      "grad_norm": 0.951493501663208,
+      "kl": 0.119873046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0088,
+      "num_tokens": 212613168.0,
+      "reward": 1.5379464626312256,
+      "reward_std": 0.21481995284557343,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5379464030265808,
+      "rewards/curriculum_aware_reward_fn/std": 0.5394460558891296,
+      "step": 1814
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 743.0,
+      "completions/max_terminated_length": 743.0,
+      "completions/mean_length": 413.89288330078125,
+      "completions/mean_terminated_length": 413.89288330078125,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "epoch": 1.8728398246066547,
+      "grad_norm": 0.8855923414230347,
+      "kl": 0.113525390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0269,
+      "num_tokens": 212741116.0,
+      "reward": 1.3468750715255737,
+      "reward_std": 0.17701692879199982,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.3718278110027313,
+      "step": 1815
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1193.0,
+      "completions/max_terminated_length": 1193.0,
+      "completions/mean_length": 405.5446472167969,
+      "completions/mean_terminated_length": 405.5446472167969,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 1.8738715501676553,
+      "grad_norm": 0.7841637134552002,
+      "kl": 0.12060546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0065,
+      "num_tokens": 212855620.0,
+      "reward": 1.4660714864730835,
+      "reward_std": 0.1883702427148819,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4660714268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.4616955816745758,
+      "step": 1816
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 660.0,
+      "completions/max_terminated_length": 660.0,
+      "completions/mean_length": 414.9464416503906,
+      "completions/mean_terminated_length": 414.9464416503906,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "epoch": 1.8749032757286561,
+      "grad_norm": 0.7373315691947937,
+      "kl": 0.1258544921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0139,
+      "num_tokens": 212972598.0,
+      "reward": 1.3808037042617798,
+      "reward_std": 0.16404660046100616,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38080358505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.41812780499458313,
+      "step": 1817
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 884.0,
+      "completions/max_terminated_length": 884.0,
+      "completions/mean_length": 416.71429443359375,
+      "completions/mean_terminated_length": 416.71429443359375,
+      "completions/min_length": 166.0,
+      "completions/min_terminated_length": 166.0,
+      "epoch": 1.875935001289657,
+      "grad_norm": 0.747146487236023,
+      "kl": 0.1004638671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0098,
+      "num_tokens": 213077020.0,
+      "reward": 1.4812500476837158,
+      "reward_std": 0.19955526292324066,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48125001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.45217815041542053,
+      "step": 1818
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 695.0,
+      "completions/max_terminated_length": 695.0,
+      "completions/mean_length": 415.3839416503906,
+      "completions/mean_terminated_length": 415.3839416503906,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 1.8769667268506578,
+      "grad_norm": 0.8933224081993103,
+      "kl": 0.127685546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0162,
+      "num_tokens": 213185899.0,
+      "reward": 1.4156250953674316,
+      "reward_std": 0.176749125123024,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4156250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.4053710997104645,
+      "step": 1819
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1053.0,
+      "completions/max_terminated_length": 1053.0,
+      "completions/mean_length": 455.33929443359375,
+      "completions/mean_terminated_length": 455.33929443359375,
+      "completions/min_length": 133.0,
+      "completions/min_terminated_length": 133.0,
+      "epoch": 1.8779984524116586,
+      "grad_norm": 0.8117207884788513,
+      "kl": 0.1043701171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0135,
+      "num_tokens": 213308066.0,
+      "reward": 1.3950893878936768,
+      "reward_std": 0.18113988637924194,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3950892984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.3946123719215393,
+      "step": 1820
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1113.0,
+      "completions/max_terminated_length": 1113.0,
+      "completions/mean_length": 413.1964416503906,
+      "completions/mean_terminated_length": 413.1964416503906,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 1.8790301779726593,
+      "grad_norm": 0.756941556930542,
+      "kl": 0.1058349609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0029,
+      "num_tokens": 213422930.0,
+      "reward": 1.5508930683135986,
+      "reward_std": 0.21374936401844025,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5508928894996643,
+      "rewards/curriculum_aware_reward_fn/std": 0.4344397783279419,
+      "step": 1821
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 788.0,
+      "completions/max_terminated_length": 788.0,
+      "completions/mean_length": 414.0089416503906,
+      "completions/mean_terminated_length": 414.0089416503906,
+      "completions/min_length": 143.0,
+      "completions/min_terminated_length": 143.0,
+      "epoch": 1.88006190353366,
+      "grad_norm": 0.8877094388008118,
+      "kl": 0.114990234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0094,
+      "num_tokens": 213543086.0,
+      "reward": 1.3727679252624512,
+      "reward_std": 0.17864759266376495,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3727678656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.45340994000434875,
+      "step": 1822
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1414.0,
+      "completions/max_terminated_length": 1414.0,
+      "completions/mean_length": 418.51788330078125,
+      "completions/mean_terminated_length": 418.51788330078125,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 1.8810936290946607,
+      "grad_norm": 0.835811972618103,
+      "kl": 0.1126708984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0463,
+      "num_tokens": 213651168.0,
+      "reward": 1.4678572416305542,
+      "reward_std": 0.22902914881706238,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46785715222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.41961249709129333,
+      "step": 1823
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 629.0,
+      "completions/max_terminated_length": 629.0,
+      "completions/mean_length": 382.5714416503906,
+      "completions/mean_terminated_length": 382.5714416503906,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 1.8821253546556616,
+      "grad_norm": 0.8936151266098022,
+      "kl": 0.115966796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0001,
+      "num_tokens": 213761405.0,
+      "reward": 1.4495537281036377,
+      "reward_std": 0.20912609994411469,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44955354928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.3940146565437317,
+      "step": 1824
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 651.0,
+      "completions/max_terminated_length": 651.0,
+      "completions/mean_length": 358.3214416503906,
+      "completions/mean_terminated_length": 358.3214416503906,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "epoch": 1.8831570802166624,
+      "grad_norm": 0.8577347993850708,
+      "kl": 0.1298828125,
+      "learning_rate": 1e-06,
+      "loss": -0.003,
+      "num_tokens": 213861037.0,
+      "reward": 1.4705358743667603,
+      "reward_std": 0.16662080585956573,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4705357253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.45205003023147583,
+      "step": 1825
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 774.0,
+      "completions/max_terminated_length": 774.0,
+      "completions/mean_length": 445.08929443359375,
+      "completions/mean_terminated_length": 445.08929443359375,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "epoch": 1.8841888057776632,
+      "grad_norm": 0.8253490924835205,
+      "kl": 0.1148681640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0003,
+      "num_tokens": 213988223.0,
+      "reward": 1.4049108028411865,
+      "reward_std": 0.3005301058292389,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40491071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.4229326844215393,
+      "step": 1826
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1327.0,
+      "completions/max_terminated_length": 1327.0,
+      "completions/mean_length": 392.9107360839844,
+      "completions/mean_terminated_length": 392.9107360839844,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 1.885220531338664,
+      "grad_norm": 0.8072178363800049,
+      "kl": 0.111572265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0163,
+      "num_tokens": 214099576.0,
+      "reward": 1.3821429014205933,
+      "reward_std": 0.1937621682882309,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3821428716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.40790924429893494,
+      "step": 1827
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 828.0,
+      "completions/max_terminated_length": 828.0,
+      "completions/mean_length": 420.2321472167969,
+      "completions/mean_terminated_length": 420.2321472167969,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 1.8862522568996647,
+      "grad_norm": 0.8928146958351135,
+      "kl": 0.1011962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0382,
+      "num_tokens": 214223089.0,
+      "reward": 1.352678656578064,
+      "reward_std": 0.1972290426492691,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3616071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.40559056401252747,
+      "step": 1828
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 748.0,
+      "completions/max_terminated_length": 748.0,
+      "completions/mean_length": 381.1071472167969,
+      "completions/mean_terminated_length": 381.1071472167969,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 1.8872839824606653,
+      "grad_norm": 0.8089170455932617,
+      "kl": 0.11865234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0181,
+      "num_tokens": 214333239.0,
+      "reward": 1.3714287281036377,
+      "reward_std": 0.19539318978786469,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37142854928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.44039878249168396,
+      "step": 1829
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1148.0,
+      "completions/max_terminated_length": 1148.0,
+      "completions/mean_length": 434.2321472167969,
+      "completions/mean_terminated_length": 434.2321472167969,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 1.8883157080216662,
+      "grad_norm": 0.8478483557701111,
+      "kl": 0.1148681640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0157,
+      "num_tokens": 214448422.0,
+      "reward": 1.3419644832611084,
+      "reward_std": 0.22933442890644073,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3419642746448517,
+      "rewards/curriculum_aware_reward_fn/std": 0.3999185562133789,
+      "step": 1830
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 850.0,
+      "completions/max_terminated_length": 850.0,
+      "completions/mean_length": 387.33929443359375,
+      "completions/mean_terminated_length": 387.33929443359375,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 1.889347433582667,
+      "grad_norm": 0.9443395733833313,
+      "kl": 0.1378173828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0092,
+      "num_tokens": 214553222.0,
+      "reward": 1.5116074085235596,
+      "reward_std": 0.21648088097572327,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5116071105003357,
+      "rewards/curriculum_aware_reward_fn/std": 0.388168603181839,
+      "step": 1831
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 787.0,
+      "completions/max_terminated_length": 787.0,
+      "completions/mean_length": 413.7500305175781,
+      "completions/mean_terminated_length": 413.7500305175781,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 1.8903791591436678,
+      "grad_norm": 0.9532350897789001,
+      "kl": 0.11474609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0203,
+      "num_tokens": 214667658.0,
+      "reward": 1.44910728931427,
+      "reward_std": 0.28174877166748047,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4491071403026581,
+      "rewards/curriculum_aware_reward_fn/std": 0.3961516320705414,
+      "step": 1832
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 740.0,
+      "completions/max_terminated_length": 740.0,
+      "completions/mean_length": 382.9196472167969,
+      "completions/mean_terminated_length": 382.9196472167969,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 1.8914108847046687,
+      "grad_norm": 0.7535876631736755,
+      "kl": 0.1142578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0041,
+      "num_tokens": 214768435.0,
+      "reward": 1.3669644594192505,
+      "reward_std": 0.10936643183231354,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36696431040763855,
+      "rewards/curriculum_aware_reward_fn/std": 0.43617674708366394,
+      "step": 1833
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 611.0,
+      "completions/max_terminated_length": 611.0,
+      "completions/mean_length": 371.0982360839844,
+      "completions/mean_terminated_length": 371.0982360839844,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 1.8924426102656693,
+      "grad_norm": 0.8579598665237427,
+      "kl": 0.13427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0134,
+      "num_tokens": 214875140.0,
+      "reward": 1.4062501192092896,
+      "reward_std": 0.22308197617530823,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40625,
+      "rewards/curriculum_aware_reward_fn/std": 0.4337727427482605,
+      "step": 1834
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 823.0,
+      "completions/max_terminated_length": 823.0,
+      "completions/mean_length": 342.02679443359375,
+      "completions/mean_terminated_length": 342.02679443359375,
+      "completions/min_length": 123.0,
+      "completions/min_terminated_length": 123.0,
+      "epoch": 1.8934743358266701,
+      "grad_norm": 2.2421979904174805,
+      "kl": 0.490478515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0286,
+      "num_tokens": 214990636.0,
+      "reward": 1.3776785135269165,
+      "reward_std": 0.1712455004453659,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38660717010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.3968900740146637,
+      "step": 1835
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 641.0,
+      "completions/max_terminated_length": 641.0,
+      "completions/mean_length": 352.6875305175781,
+      "completions/mean_terminated_length": 352.6875305175781,
+      "completions/min_length": 112.0,
+      "completions/min_terminated_length": 112.0,
+      "epoch": 1.8945060613876707,
+      "grad_norm": 0.8646315932273865,
+      "kl": 0.12158203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0119,
+      "num_tokens": 215093662.0,
+      "reward": 1.5406250953674316,
+      "reward_std": 0.2044602483510971,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5495535731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.4226415753364563,
+      "step": 1836
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 817.0,
+      "completions/max_terminated_length": 817.0,
+      "completions/mean_length": 385.2946472167969,
+      "completions/mean_terminated_length": 385.2946472167969,
+      "completions/min_length": 153.0,
+      "completions/min_terminated_length": 153.0,
+      "epoch": 1.8955377869486716,
+      "grad_norm": 0.8068304657936096,
+      "kl": 0.125244140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0347,
+      "num_tokens": 215206269.0,
+      "reward": 1.320089340209961,
+      "reward_std": 0.1882740557193756,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.32901784777641296,
+      "rewards/curriculum_aware_reward_fn/std": 0.3777981698513031,
+      "step": 1837
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 945.0,
+      "completions/max_terminated_length": 945.0,
+      "completions/mean_length": 386.5982360839844,
+      "completions/mean_terminated_length": 386.5982360839844,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 1.8965695125096724,
+      "grad_norm": 0.6929387450218201,
+      "kl": 0.1082763671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0153,
+      "num_tokens": 215314311.0,
+      "reward": 1.5660713911056519,
+      "reward_std": 0.14396119117736816,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5660713911056519,
+      "rewards/curriculum_aware_reward_fn/std": 0.46548500657081604,
+      "step": 1838
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 771.0,
+      "completions/max_terminated_length": 771.0,
+      "completions/mean_length": 384.9464416503906,
+      "completions/mean_terminated_length": 384.9464416503906,
+      "completions/min_length": 161.0,
+      "completions/min_terminated_length": 161.0,
+      "epoch": 1.8976012380706733,
+      "grad_norm": 0.7868221402168274,
+      "kl": 0.1160888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0259,
+      "num_tokens": 215437491.0,
+      "reward": 1.4455357789993286,
+      "reward_std": 0.25672388076782227,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45446425676345825,
+      "rewards/curriculum_aware_reward_fn/std": 0.46521106362342834,
+      "step": 1839
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1219.0,
+      "completions/max_terminated_length": 1219.0,
+      "completions/mean_length": 374.0089416503906,
+      "completions/mean_terminated_length": 374.0089416503906,
+      "completions/min_length": 136.0,
+      "completions/min_terminated_length": 136.0,
+      "epoch": 1.898632963631674,
+      "grad_norm": 0.8789774775505066,
+      "kl": 0.1220703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0067,
+      "num_tokens": 215551263.0,
+      "reward": 1.5174108743667603,
+      "reward_std": 0.2568131983280182,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5263392329216003,
+      "rewards/curriculum_aware_reward_fn/std": 0.4336602985858917,
+      "step": 1840
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 666.0,
+      "completions/mean_length": 381.8482360839844,
+      "completions/mean_terminated_length": 348.38739013671875,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "epoch": 1.8996646891926747,
+      "grad_norm": 0.6855369806289673,
+      "kl": 0.1339111328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0492,
+      "num_tokens": 215662868.0,
+      "reward": 1.5156251192092896,
+      "reward_std": 0.17239533364772797,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5424107313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.4515842795372009,
+      "step": 1841
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 815.0,
+      "completions/max_terminated_length": 815.0,
+      "completions/mean_length": 356.4375305175781,
+      "completions/mean_terminated_length": 356.4375305175781,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 1.9006964147536756,
+      "grad_norm": 0.827491283416748,
+      "kl": 0.1326904296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0239,
+      "num_tokens": 215771354.0,
+      "reward": 1.434821605682373,
+      "reward_std": 0.21416500210762024,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4348214268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.547674834728241,
+      "step": 1842
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 671.0,
+      "completions/max_terminated_length": 671.0,
+      "completions/mean_length": 362.2857360839844,
+      "completions/mean_terminated_length": 362.2857360839844,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 1.9017281403146762,
+      "grad_norm": 0.8723616003990173,
+      "kl": 0.120849609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0031,
+      "num_tokens": 215879081.0,
+      "reward": 1.509374976158142,
+      "reward_std": 0.2041958123445511,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5093749761581421,
+      "rewards/curriculum_aware_reward_fn/std": 0.42243027687072754,
+      "step": 1843
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 800.0,
+      "completions/max_terminated_length": 800.0,
+      "completions/mean_length": 383.1160888671875,
+      "completions/mean_terminated_length": 383.1160888671875,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 1.902759865875677,
+      "grad_norm": 0.7478457093238831,
+      "kl": 0.119384765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0041,
+      "num_tokens": 215992333.0,
+      "reward": 1.4169644117355347,
+      "reward_std": 0.23326881229877472,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42589282989501953,
+      "rewards/curriculum_aware_reward_fn/std": 0.45217105746269226,
+      "step": 1844
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 944.0,
+      "completions/max_terminated_length": 944.0,
+      "completions/mean_length": 404.8571472167969,
+      "completions/mean_terminated_length": 404.8571472167969,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 1.9037915914366779,
+      "grad_norm": 0.8745235204696655,
+      "kl": 0.1361083984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0051,
+      "num_tokens": 216103785.0,
+      "reward": 1.2758928537368774,
+      "reward_std": 0.21973289549350739,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3482986092567444,
+      "step": 1845
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 631.0,
+      "completions/max_terminated_length": 631.0,
+      "completions/mean_length": 362.01788330078125,
+      "completions/mean_terminated_length": 362.01788330078125,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 1.9048233169976787,
+      "grad_norm": 0.8361862897872925,
+      "kl": 0.1265869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0033,
+      "num_tokens": 216215773.0,
+      "reward": 1.4366072416305542,
+      "reward_std": 0.24635306000709534,
+      "rewards/code_format_reward/mean": 0.9642857313156128,
+      "rewards/code_format_reward/std": 0.18641093373298645,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47232145071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.44336140155792236,
+      "step": 1846
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 903.0,
+      "completions/max_terminated_length": 903.0,
+      "completions/mean_length": 357.6250305175781,
+      "completions/mean_terminated_length": 357.6250305175781,
+      "completions/min_length": 116.0,
+      "completions/min_terminated_length": 116.0,
+      "epoch": 1.9058550425586795,
+      "grad_norm": 0.8441395163536072,
+      "kl": 0.1273193359375,
+      "learning_rate": 1e-06,
+      "loss": -0.022,
+      "num_tokens": 216324245.0,
+      "reward": 1.4339287281036377,
+      "reward_std": 0.28452068567276,
+      "rewards/code_format_reward/mean": 0.9642857313156128,
+      "rewards/code_format_reward/std": 0.18641091883182526,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46964284777641296,
+      "rewards/curriculum_aware_reward_fn/std": 0.4514918625354767,
+      "step": 1847
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 736.0,
+      "completions/max_terminated_length": 736.0,
+      "completions/mean_length": 376.77679443359375,
+      "completions/mean_terminated_length": 376.77679443359375,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 1.9068867681196802,
+      "grad_norm": 0.9068071842193604,
+      "kl": 0.11669921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0062,
+      "num_tokens": 216434743.0,
+      "reward": 1.4044644832611084,
+      "reward_std": 0.2747339606285095,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42232146859169006,
+      "rewards/curriculum_aware_reward_fn/std": 0.42853403091430664,
+      "step": 1848
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 898.0,
+      "completions/max_terminated_length": 898.0,
+      "completions/mean_length": 374.9285888671875,
+      "completions/mean_terminated_length": 374.9285888671875,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "epoch": 1.9079184936806808,
+      "grad_norm": 0.8925588726997375,
+      "kl": 0.12060546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0087,
+      "num_tokens": 216550270.0,
+      "reward": 1.2790179252624512,
+      "reward_std": 0.20408707857131958,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.296875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3787890374660492,
+      "step": 1849
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 970.0,
+      "completions/max_terminated_length": 970.0,
+      "completions/mean_length": 348.0625305175781,
+      "completions/mean_terminated_length": 348.0625305175781,
+      "completions/min_length": 142.0,
+      "completions/min_terminated_length": 142.0,
+      "epoch": 1.9089502192416816,
+      "grad_norm": 0.9206534624099731,
+      "kl": 0.1251220703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0216,
+      "num_tokens": 216657874.0,
+      "reward": 1.3379465341567993,
+      "reward_std": 0.18383868038654327,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35580354928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.3915058374404907,
+      "step": 1850
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 859.0,
+      "completions/max_terminated_length": 859.0,
+      "completions/mean_length": 365.8482360839844,
+      "completions/mean_terminated_length": 365.8482360839844,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 1.9099819448026825,
+      "grad_norm": 0.9138371348381042,
+      "kl": 0.116943359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0115,
+      "num_tokens": 216758622.0,
+      "reward": 1.4017857313156128,
+      "reward_std": 0.26434412598609924,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4107142984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.4232770502567291,
+      "step": 1851
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 854.0,
+      "completions/max_terminated_length": 854.0,
+      "completions/mean_length": 361.1160888671875,
+      "completions/mean_terminated_length": 361.1160888671875,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 1.9110136703636833,
+      "grad_norm": 0.9135972857475281,
+      "kl": 0.13037109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0204,
+      "num_tokens": 216867989.0,
+      "reward": 1.4691966772079468,
+      "reward_std": 0.2713601589202881,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4959821403026581,
+      "rewards/curriculum_aware_reward_fn/std": 0.44235947728157043,
+      "step": 1852
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 522.0,
+      "completions/max_terminated_length": 522.0,
+      "completions/mean_length": 320.90179443359375,
+      "completions/mean_terminated_length": 320.90179443359375,
+      "completions/min_length": 135.0,
+      "completions/min_terminated_length": 135.0,
+      "epoch": 1.9120453959246841,
+      "grad_norm": 1.002255916595459,
+      "kl": 0.123291015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0058,
+      "num_tokens": 216961547.0,
+      "reward": 1.5370535850524902,
+      "reward_std": 0.27375054359436035,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5459821820259094,
+      "rewards/curriculum_aware_reward_fn/std": 0.49616265296936035,
+      "step": 1853
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 613.0,
+      "completions/max_terminated_length": 613.0,
+      "completions/mean_length": 346.6964416503906,
+      "completions/mean_terminated_length": 346.6964416503906,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "epoch": 1.9130771214856848,
+      "grad_norm": 0.9413928389549255,
+      "kl": 0.1341552734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0012,
+      "num_tokens": 217071658.0,
+      "reward": 1.391517996788025,
+      "reward_std": 0.24773217737674713,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39151784777641296,
+      "rewards/curriculum_aware_reward_fn/std": 0.43365657329559326,
+      "step": 1854
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 577.0,
+      "completions/max_terminated_length": 577.0,
+      "completions/mean_length": 319.1339416503906,
+      "completions/mean_terminated_length": 319.1339416503906,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 1.9141088470466856,
+      "grad_norm": 0.9463006854057312,
+      "kl": 0.1324462890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0048,
+      "num_tokens": 217172347.0,
+      "reward": 1.5656250715255737,
+      "reward_std": 0.23232321441173553,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5745535492897034,
+      "rewards/curriculum_aware_reward_fn/std": 0.4415367841720581,
+      "step": 1855
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 667.0,
+      "completions/max_terminated_length": 667.0,
+      "completions/mean_length": 361.3035888671875,
+      "completions/mean_terminated_length": 361.3035888671875,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 1.9151405726076862,
+      "grad_norm": 0.8216485977172852,
+      "kl": 0.1285400390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0237,
+      "num_tokens": 217275071.0,
+      "reward": 1.379017949104309,
+      "reward_std": 0.1718110740184784,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3879464268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.3956546485424042,
+      "step": 1856
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 796.0,
+      "completions/max_terminated_length": 796.0,
+      "completions/mean_length": 360.83929443359375,
+      "completions/mean_terminated_length": 360.83929443359375,
+      "completions/min_length": 137.0,
+      "completions/min_terminated_length": 137.0,
+      "epoch": 1.916172298168687,
+      "grad_norm": 0.9167246222496033,
+      "kl": 0.1119384765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0201,
+      "num_tokens": 217371968.0,
+      "reward": 1.4901785850524902,
+      "reward_std": 0.1831408590078354,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49017858505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.3973923921585083,
+      "step": 1857
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 569.0,
+      "completions/max_terminated_length": 569.0,
+      "completions/mean_length": 341.6339416503906,
+      "completions/mean_terminated_length": 341.6339416503906,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 1.917204023729688,
+      "grad_norm": 0.8857439160346985,
+      "kl": 0.1221923828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0116,
+      "num_tokens": 217477886.0,
+      "reward": 1.5772322416305542,
+      "reward_std": 0.25595399737358093,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5772321820259094,
+      "rewards/curriculum_aware_reward_fn/std": 0.5594353675842285,
+      "step": 1858
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 627.0,
+      "completions/max_terminated_length": 627.0,
+      "completions/mean_length": 349.9464416503906,
+      "completions/mean_terminated_length": 349.9464416503906,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 1.9182357492906887,
+      "grad_norm": 0.9459420442581177,
+      "kl": 0.1263427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0211,
+      "num_tokens": 217581256.0,
+      "reward": 1.3508927822113037,
+      "reward_std": 0.1190386638045311,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3508928418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.4276471436023712,
+      "step": 1859
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 697.0,
+      "completions/max_terminated_length": 697.0,
+      "completions/mean_length": 358.6339416503906,
+      "completions/mean_terminated_length": 358.6339416503906,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 1.9192674748516896,
+      "grad_norm": 0.9291829466819763,
+      "kl": 0.1224365234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0183,
+      "num_tokens": 217687454.0,
+      "reward": 1.5843751430511475,
+      "reward_std": 0.2358408272266388,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5843749642372131,
+      "rewards/curriculum_aware_reward_fn/std": 0.37901195883750916,
+      "step": 1860
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 961.0,
+      "completions/max_terminated_length": 961.0,
+      "completions/mean_length": 395.1250305175781,
+      "completions/mean_terminated_length": 395.1250305175781,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 1.9202992004126902,
+      "grad_norm": 0.7725751996040344,
+      "kl": 0.104736328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0074,
+      "num_tokens": 217811746.0,
+      "reward": 1.4312500953674316,
+      "reward_std": 0.18673662841320038,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4491071403026581,
+      "rewards/curriculum_aware_reward_fn/std": 0.4288042187690735,
+      "step": 1861
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 639.0,
+      "completions/max_terminated_length": 639.0,
+      "completions/mean_length": 313.08038330078125,
+      "completions/mean_terminated_length": 313.08038330078125,
+      "completions/min_length": 100.0,
+      "completions/min_terminated_length": 100.0,
+      "epoch": 1.921330925973691,
+      "grad_norm": 0.836054801940918,
+      "kl": 0.133544921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0099,
+      "num_tokens": 217908082.0,
+      "reward": 1.7017858028411865,
+      "reward_std": 0.2549782395362854,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7107143402099609,
+      "rewards/curriculum_aware_reward_fn/std": 0.39770904183387756,
+      "step": 1862
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 842.0,
+      "completions/max_terminated_length": 842.0,
+      "completions/mean_length": 384.2232360839844,
+      "completions/mean_terminated_length": 384.2232360839844,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 1.9223626515346917,
+      "grad_norm": 0.8646076917648315,
+      "kl": 0.1246337890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0006,
+      "num_tokens": 218020282.0,
+      "reward": 1.3397324085235596,
+      "reward_std": 0.25197386741638184,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3486607074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.388022243976593,
+      "step": 1863
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 753.0,
+      "completions/max_terminated_length": 753.0,
+      "completions/mean_length": 362.3660888671875,
+      "completions/mean_terminated_length": 362.3660888671875,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 1.9233943770956925,
+      "grad_norm": 0.8585340976715088,
+      "kl": 0.121337890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0101,
+      "num_tokens": 218131511.0,
+      "reward": 1.6879466772079468,
+      "reward_std": 0.2607966363430023,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6879464387893677,
+      "rewards/curriculum_aware_reward_fn/std": 0.36743852496147156,
+      "step": 1864
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 613.0,
+      "completions/max_terminated_length": 613.0,
+      "completions/mean_length": 360.76788330078125,
+      "completions/mean_terminated_length": 360.76788330078125,
+      "completions/min_length": 158.0,
+      "completions/min_terminated_length": 158.0,
+      "epoch": 1.9244261026566933,
+      "grad_norm": 0.7554998397827148,
+      "kl": 0.1197509765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0082,
+      "num_tokens": 218241917.0,
+      "reward": 1.380357265472412,
+      "reward_std": 0.15899653732776642,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38928571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.447486937046051,
+      "step": 1865
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 925.0,
+      "completions/max_terminated_length": 925.0,
+      "completions/mean_length": 344.2946472167969,
+      "completions/mean_terminated_length": 344.2946472167969,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 1.9254578282176942,
+      "grad_norm": 0.8225184082984924,
+      "kl": 0.15380859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0085,
+      "num_tokens": 218339964.0,
+      "reward": 1.528571605682373,
+      "reward_std": 0.14810487627983093,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5285714268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.40640780329704285,
+      "step": 1866
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 669.0,
+      "completions/max_terminated_length": 669.0,
+      "completions/mean_length": 345.2589416503906,
+      "completions/mean_terminated_length": 345.2589416503906,
+      "completions/min_length": 150.0,
+      "completions/min_terminated_length": 150.0,
+      "epoch": 1.926489553778695,
+      "grad_norm": 0.8637830018997192,
+      "kl": 0.113525390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0011,
+      "num_tokens": 218435718.0,
+      "reward": 1.4928573369979858,
+      "reward_std": 0.2717527449131012,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4928571581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.4726700484752655,
+      "step": 1867
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 654.0,
+      "completions/max_terminated_length": 654.0,
+      "completions/mean_length": 353.9196472167969,
+      "completions/mean_terminated_length": 353.9196472167969,
+      "completions/min_length": 150.0,
+      "completions/min_terminated_length": 150.0,
+      "epoch": 1.9275212793396956,
+      "grad_norm": 0.887877881526947,
+      "kl": 0.134033203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0117,
+      "num_tokens": 218547618.0,
+      "reward": 1.3991073369979858,
+      "reward_std": 0.2046002745628357,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4169642925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.42461180686950684,
+      "step": 1868
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 708.0,
+      "completions/max_terminated_length": 708.0,
+      "completions/mean_length": 366.3214416503906,
+      "completions/mean_terminated_length": 366.3214416503906,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 1.9285530049006963,
+      "grad_norm": 0.8546643853187561,
+      "kl": 0.13525390625,
+      "learning_rate": 1e-06,
+      "loss": 0.025,
+      "num_tokens": 218661201.0,
+      "reward": 1.4066965579986572,
+      "reward_std": 0.2563186585903168,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40669646859169006,
+      "rewards/curriculum_aware_reward_fn/std": 0.431657999753952,
+      "step": 1869
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 812.0,
+      "completions/max_terminated_length": 812.0,
+      "completions/mean_length": 393.33929443359375,
+      "completions/mean_terminated_length": 393.33929443359375,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 1.929584730461697,
+      "grad_norm": 0.8663255572319031,
+      "kl": 0.116455078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0169,
+      "num_tokens": 218770761.0,
+      "reward": 1.5607144832611084,
+      "reward_std": 0.2684311270713806,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5607143044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.4703567922115326,
+      "step": 1870
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 929.0,
+      "completions/max_terminated_length": 929.0,
+      "completions/mean_length": 386.4910888671875,
+      "completions/mean_terminated_length": 386.4910888671875,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 1.930616456022698,
+      "grad_norm": 0.8403106331825256,
+      "kl": 0.1114501953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0089,
+      "num_tokens": 218880758.0,
+      "reward": 1.508928656578064,
+      "reward_std": 0.27292829751968384,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5089285373687744,
+      "rewards/curriculum_aware_reward_fn/std": 0.5379396080970764,
+      "step": 1871
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 560.0,
+      "completions/max_terminated_length": 560.0,
+      "completions/mean_length": 344.1250305175781,
+      "completions/mean_terminated_length": 344.1250305175781,
+      "completions/min_length": 150.0,
+      "completions/min_terminated_length": 150.0,
+      "epoch": 1.9316481815836988,
+      "grad_norm": 0.9799405932426453,
+      "kl": 0.1356201171875,
+      "learning_rate": 1e-06,
+      "loss": -0.01,
+      "num_tokens": 218978660.0,
+      "reward": 1.2950893640518188,
+      "reward_std": 0.16900257766246796,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2950892746448517,
+      "rewards/curriculum_aware_reward_fn/std": 0.3908848762512207,
+      "step": 1872
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 870.0,
+      "completions/max_terminated_length": 870.0,
+      "completions/mean_length": 384.15179443359375,
+      "completions/mean_terminated_length": 384.15179443359375,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 1.9326799071446996,
+      "grad_norm": 0.8737776875495911,
+      "kl": 0.11572265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0274,
+      "num_tokens": 219086967.0,
+      "reward": 1.368303656578064,
+      "reward_std": 0.1639767438173294,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3683035671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.431423157453537,
+      "step": 1873
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 660.0,
+      "completions/max_terminated_length": 660.0,
+      "completions/mean_length": 349.2410888671875,
+      "completions/mean_terminated_length": 349.2410888671875,
+      "completions/min_length": 131.0,
+      "completions/min_terminated_length": 131.0,
+      "epoch": 1.9337116327057002,
+      "grad_norm": 0.9632400274276733,
+      "kl": 0.13720703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0023,
+      "num_tokens": 219193772.0,
+      "reward": 1.5540179014205933,
+      "reward_std": 0.20878908038139343,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5540178418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.4543651342391968,
+      "step": 1874
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 968.0,
+      "completions/max_terminated_length": 968.0,
+      "completions/mean_length": 403.5000305175781,
+      "completions/mean_terminated_length": 403.5000305175781,
+      "completions/min_length": 107.0,
+      "completions/min_terminated_length": 107.0,
+      "epoch": 1.934743358266701,
+      "grad_norm": 0.6932012438774109,
+      "kl": 0.112060546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0101,
+      "num_tokens": 219314522.0,
+      "reward": 1.4258930683135986,
+      "reward_std": 0.15361928939819336,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42589282989501953,
+      "rewards/curriculum_aware_reward_fn/std": 0.4594837427139282,
+      "step": 1875
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 873.0,
+      "completions/max_terminated_length": 873.0,
+      "completions/mean_length": 391.3839416503906,
+      "completions/mean_terminated_length": 391.3839416503906,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 1.9357750838277017,
+      "grad_norm": 0.9562661647796631,
+      "kl": 0.1275634765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0157,
+      "num_tokens": 219425620.0,
+      "reward": 1.3147321939468384,
+      "reward_std": 0.23054131865501404,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3147321343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.37039291858673096,
+      "step": 1876
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 861.0,
+      "completions/max_terminated_length": 861.0,
+      "completions/mean_length": 416.7321472167969,
+      "completions/mean_terminated_length": 416.7321472167969,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 1.9368068093887025,
+      "grad_norm": 0.8335247039794922,
+      "kl": 0.1182861328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0149,
+      "num_tokens": 219540887.0,
+      "reward": 1.3441965579986572,
+      "reward_std": 0.25252988934516907,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3441964089870453,
+      "rewards/curriculum_aware_reward_fn/std": 0.3944288492202759,
+      "step": 1877
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 863.0,
+      "completions/max_terminated_length": 863.0,
+      "completions/mean_length": 385.89288330078125,
+      "completions/mean_terminated_length": 385.89288330078125,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 1.9378385349497034,
+      "grad_norm": 0.8629086017608643,
+      "kl": 0.1168212890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0315,
+      "num_tokens": 219650754.0,
+      "reward": 1.377678632736206,
+      "reward_std": 0.1759658306837082,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3866071403026581,
+      "rewards/curriculum_aware_reward_fn/std": 0.4390295147895813,
+      "step": 1878
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 624.0,
+      "completions/max_terminated_length": 624.0,
+      "completions/mean_length": 338.3214416503906,
+      "completions/mean_terminated_length": 338.3214416503906,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 1.9388702605107042,
+      "grad_norm": 0.8721398115158081,
+      "kl": 0.12548828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0169,
+      "num_tokens": 219751233.0,
+      "reward": 1.4821429252624512,
+      "reward_std": 0.1664353460073471,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4910714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.4412418007850647,
+      "step": 1879
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 704.0,
+      "completions/max_terminated_length": 704.0,
+      "completions/mean_length": 370.21429443359375,
+      "completions/mean_terminated_length": 370.21429443359375,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 1.939901986071705,
+      "grad_norm": 0.8873890042304993,
+      "kl": 0.1141357421875,
+      "learning_rate": 1e-06,
+      "loss": -0.024,
+      "num_tokens": 219851847.0,
+      "reward": 1.4955357313156128,
+      "reward_std": 0.2472870945930481,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4955357015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.45458391308784485,
+      "step": 1880
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1015.0,
+      "completions/max_terminated_length": 1015.0,
+      "completions/mean_length": 426.7232360839844,
+      "completions/mean_terminated_length": 426.7232360839844,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 1.9409337116327057,
+      "grad_norm": 0.6784756183624268,
+      "kl": 0.108154296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0009,
+      "num_tokens": 219969122.0,
+      "reward": 1.3883929252624512,
+      "reward_std": 0.13435271382331848,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3883928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.4352167248725891,
+      "step": 1881
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 808.0,
+      "completions/max_terminated_length": 808.0,
+      "completions/mean_length": 393.9107360839844,
+      "completions/mean_terminated_length": 393.9107360839844,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 1.9419654371937065,
+      "grad_norm": 0.8890510201454163,
+      "kl": 0.1182861328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0259,
+      "num_tokens": 220083674.0,
+      "reward": 1.3825894594192505,
+      "reward_std": 0.20253174006938934,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38258928060531616,
+      "rewards/curriculum_aware_reward_fn/std": 0.429052472114563,
+      "step": 1882
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 717.0,
+      "completions/max_terminated_length": 717.0,
+      "completions/mean_length": 370.89288330078125,
+      "completions/mean_terminated_length": 370.89288330078125,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 1.9429971627547071,
+      "grad_norm": 0.7917482256889343,
+      "kl": 0.1241455078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0064,
+      "num_tokens": 220191504.0,
+      "reward": 1.5214287042617798,
+      "reward_std": 0.14497151970863342,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5214285850524902,
+      "rewards/curriculum_aware_reward_fn/std": 0.4005465805530548,
+      "step": 1883
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 948.0,
+      "completions/max_terminated_length": 948.0,
+      "completions/mean_length": 404.6160888671875,
+      "completions/mean_terminated_length": 404.6160888671875,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 1.944028888315708,
+      "grad_norm": 0.8562974333763123,
+      "kl": 0.13427734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0064,
+      "num_tokens": 220300983.0,
+      "reward": 1.3504464626312256,
+      "reward_std": 0.1296975463628769,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.359375,
+      "rewards/curriculum_aware_reward_fn/std": 0.41435563564300537,
+      "step": 1884
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 743.0,
+      "completions/max_terminated_length": 743.0,
+      "completions/mean_length": 408.2946472167969,
+      "completions/mean_terminated_length": 408.2946472167969,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 1.9450606138767088,
+      "grad_norm": 1.1476508378982544,
+      "kl": 0.1634521484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0169,
+      "num_tokens": 220421668.0,
+      "reward": 1.2767857313156128,
+      "reward_std": 0.19308973848819733,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2767857015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.363660603761673,
+      "step": 1885
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 879.0,
+      "completions/max_terminated_length": 879.0,
+      "completions/mean_length": 415.1607360839844,
+      "completions/mean_terminated_length": 415.1607360839844,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 1.9460923394377097,
+      "grad_norm": 0.7268807888031006,
+      "kl": 0.121337890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0144,
+      "num_tokens": 220539909.0,
+      "reward": 1.4200893640518188,
+      "reward_std": 0.18233086168766022,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4200892746448517,
+      "rewards/curriculum_aware_reward_fn/std": 0.4332186281681061,
+      "step": 1886
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1001.0,
+      "completions/max_terminated_length": 1001.0,
+      "completions/mean_length": 421.7589416503906,
+      "completions/mean_terminated_length": 421.7589416503906,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 1.9471240649987105,
+      "grad_norm": 0.8596259951591492,
+      "kl": 0.121337890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0174,
+      "num_tokens": 220650363.0,
+      "reward": 1.4325894117355347,
+      "reward_std": 0.20996662974357605,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43258926272392273,
+      "rewards/curriculum_aware_reward_fn/std": 0.4076802134513855,
+      "step": 1887
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 744.0,
+      "completions/max_terminated_length": 744.0,
+      "completions/mean_length": 403.65179443359375,
+      "completions/mean_terminated_length": 403.65179443359375,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 1.948155790559711,
+      "grad_norm": 0.64628666639328,
+      "kl": 0.1204833984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0041,
+      "num_tokens": 220757648.0,
+      "reward": 1.6053574085235596,
+      "reward_std": 0.09190364181995392,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6053571701049805,
+      "rewards/curriculum_aware_reward_fn/std": 0.42486655712127686,
+      "step": 1888
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 782.0,
+      "completions/max_terminated_length": 782.0,
+      "completions/mean_length": 356.52679443359375,
+      "completions/mean_terminated_length": 356.52679443359375,
+      "completions/min_length": 128.0,
+      "completions/min_terminated_length": 128.0,
+      "epoch": 1.9491875161207117,
+      "grad_norm": 0.877808690071106,
+      "kl": 0.1322021484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0202,
+      "num_tokens": 220858108.0,
+      "reward": 1.5473216772079468,
+      "reward_std": 0.15397398173809052,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5473214387893677,
+      "rewards/curriculum_aware_reward_fn/std": 0.4423442482948303,
+      "step": 1889
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 784.0,
+      "completions/max_terminated_length": 784.0,
+      "completions/mean_length": 405.08038330078125,
+      "completions/mean_terminated_length": 405.08038330078125,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 1.9502192416817126,
+      "grad_norm": 0.8030251264572144,
+      "kl": 0.1214599609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0091,
+      "num_tokens": 220969599.0,
+      "reward": 1.581696629524231,
+      "reward_std": 0.27979207038879395,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5816964507102966,
+      "rewards/curriculum_aware_reward_fn/std": 0.45152372121810913,
+      "step": 1890
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1016.0,
+      "completions/max_terminated_length": 1016.0,
+      "completions/mean_length": 446.8125305175781,
+      "completions/mean_terminated_length": 446.8125305175781,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 1.9512509672427134,
+      "grad_norm": 0.7618669271469116,
+      "kl": 0.112548828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0191,
+      "num_tokens": 221093793.0,
+      "reward": 1.5339287519454956,
+      "reward_std": 0.1955319494009018,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5339285731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.4265519678592682,
+      "step": 1891
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1333.0,
+      "completions/max_terminated_length": 1333.0,
+      "completions/mean_length": 491.08038330078125,
+      "completions/mean_terminated_length": 491.08038330078125,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 1.9522826928037142,
+      "grad_norm": 0.8040339350700378,
+      "kl": 0.108642578125,
+      "learning_rate": 1e-06,
+      "loss": 0.015,
+      "num_tokens": 221218843.0,
+      "reward": 1.4285714626312256,
+      "reward_std": 0.23945212364196777,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4285714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.4297869801521301,
+      "step": 1892
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 746.0,
+      "completions/max_terminated_length": 746.0,
+      "completions/mean_length": 398.5446472167969,
+      "completions/mean_terminated_length": 398.5446472167969,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 1.953314418364715,
+      "grad_norm": 0.6367735266685486,
+      "kl": 0.1279296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0007,
+      "num_tokens": 221325871.0,
+      "reward": 1.5861608982086182,
+      "reward_std": 0.1092190220952034,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5861607193946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.43481433391571045,
+      "step": 1893
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1197.0,
+      "completions/max_terminated_length": 1197.0,
+      "completions/mean_length": 435.5000305175781,
+      "completions/mean_terminated_length": 435.5000305175781,
+      "completions/min_length": 129.0,
+      "completions/min_terminated_length": 129.0,
+      "epoch": 1.9543461439257157,
+      "grad_norm": 0.738890528678894,
+      "kl": 0.1275634765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0317,
+      "num_tokens": 221447587.0,
+      "reward": 1.4392858743667603,
+      "reward_std": 0.14185898005962372,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4392856955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.42930009961128235,
+      "step": 1894
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 854.0,
+      "completions/max_terminated_length": 854.0,
+      "completions/mean_length": 388.0089416503906,
+      "completions/mean_terminated_length": 388.0089416503906,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "epoch": 1.9553778694867165,
+      "grad_norm": 0.7600980997085571,
+      "kl": 0.123779296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0028,
+      "num_tokens": 221554492.0,
+      "reward": 1.6709821224212646,
+      "reward_std": 0.1413257122039795,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6709821820259094,
+      "rewards/curriculum_aware_reward_fn/std": 0.41620543599128723,
+      "step": 1895
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 775.0,
+      "completions/max_terminated_length": 775.0,
+      "completions/mean_length": 437.0625305175781,
+      "completions/mean_terminated_length": 437.0625305175781,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 1.9564095950477172,
+      "grad_norm": 0.7145452499389648,
+      "kl": 0.11181640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0277,
+      "num_tokens": 221663636.0,
+      "reward": 1.589285969734192,
+      "reward_std": 0.13859732449054718,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5892857313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.4321238398551941,
+      "step": 1896
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2998.0,
+      "completions/max_terminated_length": 2998.0,
+      "completions/mean_length": 515.794677734375,
+      "completions/mean_terminated_length": 515.794677734375,
+      "completions/min_length": 275.0,
+      "completions/min_terminated_length": 275.0,
+      "epoch": 1.957441320608718,
+      "grad_norm": 0.570237398147583,
+      "kl": 0.1094970703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0342,
+      "num_tokens": 221785686.0,
+      "reward": 1.4093750715255737,
+      "reward_std": 0.15295647084712982,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41830354928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.45212188363075256,
+      "step": 1897
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 671.0,
+      "completions/max_terminated_length": 671.0,
+      "completions/mean_length": 401.83929443359375,
+      "completions/mean_terminated_length": 401.83929443359375,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 1.9584730461697188,
+      "grad_norm": 0.6868841648101807,
+      "kl": 0.1219482421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0128,
+      "num_tokens": 221896205.0,
+      "reward": 1.5933037996292114,
+      "reward_std": 0.2061375081539154,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5933035612106323,
+      "rewards/curriculum_aware_reward_fn/std": 0.4708864092826843,
+      "step": 1898
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1397.0,
+      "completions/max_terminated_length": 1397.0,
+      "completions/mean_length": 465.6875305175781,
+      "completions/mean_terminated_length": 465.6875305175781,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 1.9595047717307197,
+      "grad_norm": 0.7461130619049072,
+      "kl": 0.115234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0058,
+      "num_tokens": 222008534.0,
+      "reward": 1.2727680206298828,
+      "reward_std": 0.18516550958156586,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2727678418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.39231860637664795,
+      "step": 1899
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1021.0,
+      "completions/max_terminated_length": 1021.0,
+      "completions/mean_length": 430.0000305175781,
+      "completions/mean_terminated_length": 430.0000305175781,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 1.9605364972917205,
+      "grad_norm": 0.7491604685783386,
+      "kl": 0.1195068359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0155,
+      "num_tokens": 222123719.0,
+      "reward": 1.4834821224212646,
+      "reward_std": 0.19401638209819794,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48348215222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.44597241282463074,
+      "step": 1900
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1003.0,
+      "completions/max_terminated_length": 1003.0,
+      "completions/mean_length": 414.5446472167969,
+      "completions/mean_terminated_length": 414.5446472167969,
+      "completions/min_length": 138.0,
+      "completions/min_terminated_length": 138.0,
+      "epoch": 1.9615682228527211,
+      "grad_norm": 0.7671953439712524,
+      "kl": 0.10888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0175,
+      "num_tokens": 222236802.0,
+      "reward": 1.532589316368103,
+      "reward_std": 0.18732120096683502,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.532589316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.38488972187042236,
+      "step": 1901
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1200.0,
+      "completions/max_terminated_length": 1200.0,
+      "completions/mean_length": 460.8660888671875,
+      "completions/mean_terminated_length": 460.8660888671875,
+      "completions/min_length": 243.0,
+      "completions/min_terminated_length": 243.0,
+      "epoch": 1.962599948413722,
+      "grad_norm": 0.7865824699401855,
+      "kl": 0.1195068359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0107,
+      "num_tokens": 222369955.0,
+      "reward": 1.4214287996292114,
+      "reward_std": 0.20716939866542816,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4214286208152771,
+      "rewards/curriculum_aware_reward_fn/std": 0.4383997619152069,
+      "step": 1902
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 836.0,
+      "completions/max_terminated_length": 836.0,
+      "completions/mean_length": 453.3482360839844,
+      "completions/mean_terminated_length": 453.3482360839844,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 1.9636316739747226,
+      "grad_norm": 0.8720301985740662,
+      "kl": 0.1109619140625,
+      "learning_rate": 1e-06,
+      "loss": 0.018,
+      "num_tokens": 222485613.0,
+      "reward": 1.5343750715255737,
+      "reward_std": 0.2358490377664566,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.534375011920929,
+      "rewards/curriculum_aware_reward_fn/std": 0.43599480390548706,
+      "step": 1903
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 957.0,
+      "completions/max_terminated_length": 957.0,
+      "completions/mean_length": 515.4642944335938,
+      "completions/mean_terminated_length": 515.4642944335938,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 1.9646633995357234,
+      "grad_norm": 0.723853349685669,
+      "kl": 0.105224609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0098,
+      "num_tokens": 222620912.0,
+      "reward": 1.378571629524231,
+      "reward_std": 0.25803887844085693,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37857145071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.43130403757095337,
+      "step": 1904
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1097.0,
+      "completions/max_terminated_length": 1097.0,
+      "completions/mean_length": 467.4464416503906,
+      "completions/mean_terminated_length": 467.4464416503906,
+      "completions/min_length": 111.0,
+      "completions/min_terminated_length": 111.0,
+      "epoch": 1.9656951250967243,
+      "grad_norm": 0.8006047606468201,
+      "kl": 0.1103515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0055,
+      "num_tokens": 222734082.0,
+      "reward": 1.364732265472412,
+      "reward_std": 0.20015935599803925,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36473211646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.3692357838153839,
+      "step": 1905
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1091.0,
+      "completions/max_terminated_length": 1091.0,
+      "completions/mean_length": 507.4107360839844,
+      "completions/mean_terminated_length": 507.4107360839844,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 1.9667268506577251,
+      "grad_norm": 0.723853349685669,
+      "kl": 0.1212158203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0009,
+      "num_tokens": 222857034.0,
+      "reward": 1.4633928537368774,
+      "reward_std": 0.22907757759094238,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46339288353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.4332977831363678,
+      "step": 1906
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1025.0,
+      "completions/max_terminated_length": 1025.0,
+      "completions/mean_length": 485.2410888671875,
+      "completions/mean_terminated_length": 485.2410888671875,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 1.967758576218726,
+      "grad_norm": 0.6630465388298035,
+      "kl": 0.1126708984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0124,
+      "num_tokens": 222978490.0,
+      "reward": 1.5424107313156128,
+      "reward_std": 0.2230999618768692,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5424107313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.4499354362487793,
+      "step": 1907
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1182.0,
+      "completions/max_terminated_length": 1182.0,
+      "completions/mean_length": 483.83929443359375,
+      "completions/mean_terminated_length": 483.83929443359375,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 1.9687903017797266,
+      "grad_norm": 0.6432393193244934,
+      "kl": 0.120361328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0293,
+      "num_tokens": 223105697.0,
+      "reward": 1.5803571939468384,
+      "reward_std": 0.15816918015480042,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5803571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.411838173866272,
+      "step": 1908
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 839.0,
+      "completions/max_terminated_length": 839.0,
+      "completions/mean_length": 442.3660888671875,
+      "completions/mean_terminated_length": 442.3660888671875,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 1.9698220273407274,
+      "grad_norm": 0.7805054783821106,
+      "kl": 0.1103515625,
+      "learning_rate": 1e-06,
+      "loss": -0.001,
+      "num_tokens": 223216114.0,
+      "reward": 1.4758929014205933,
+      "reward_std": 0.1755753457546234,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4758928418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.4153573215007782,
+      "step": 1909
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1045.0,
+      "completions/max_terminated_length": 1045.0,
+      "completions/mean_length": 532.3125,
+      "completions/mean_terminated_length": 532.3125,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 1.970853752901728,
+      "grad_norm": 0.7325468063354492,
+      "kl": 0.10595703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0275,
+      "num_tokens": 223338600.0,
+      "reward": 1.4348214864730835,
+      "reward_std": 0.2412104308605194,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43482139706611633,
+      "rewards/curriculum_aware_reward_fn/std": 0.4159456491470337,
+      "step": 1910
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1092.0,
+      "completions/max_terminated_length": 1092.0,
+      "completions/mean_length": 416.1964416503906,
+      "completions/mean_terminated_length": 416.1964416503906,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 1.9718854784627289,
+      "grad_norm": 0.7689169645309448,
+      "kl": 0.12548828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0099,
+      "num_tokens": 223443972.0,
+      "reward": 1.6049107313156128,
+      "reward_std": 0.2390371859073639,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6138392090797424,
+      "rewards/curriculum_aware_reward_fn/std": 0.40348759293556213,
+      "step": 1911
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1235.0,
+      "completions/max_terminated_length": 1235.0,
+      "completions/mean_length": 496.7410888671875,
+      "completions/mean_terminated_length": 496.7410888671875,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "epoch": 1.9729172040237297,
+      "grad_norm": 0.7692927718162537,
+      "kl": 0.11083984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0252,
+      "num_tokens": 223573730.0,
+      "reward": 1.349107265472412,
+      "reward_std": 0.23767122626304626,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34910711646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.4030839204788208,
+      "step": 1912
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 970.0,
+      "completions/max_terminated_length": 970.0,
+      "completions/mean_length": 500.5982360839844,
+      "completions/mean_terminated_length": 500.5982360839844,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 1.9739489295847306,
+      "grad_norm": 0.8327507376670837,
+      "kl": 0.1146240234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0009,
+      "num_tokens": 223691215.0,
+      "reward": 1.4968750476837158,
+      "reward_std": 0.24476194381713867,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49687498807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.4174289107322693,
+      "step": 1913
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1067.0,
+      "completions/max_terminated_length": 1067.0,
+      "completions/mean_length": 510.8660888671875,
+      "completions/mean_terminated_length": 510.8660888671875,
+      "completions/min_length": 274.0,
+      "completions/min_terminated_length": 274.0,
+      "epoch": 1.9749806551457314,
+      "grad_norm": 0.761043131351471,
+      "kl": 0.114501953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0034,
+      "num_tokens": 223823353.0,
+      "reward": 1.3392857313156128,
+      "reward_std": 0.18880169093608856,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3482142984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.4080788195133209,
+      "step": 1914
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1128.0,
+      "completions/max_terminated_length": 1128.0,
+      "completions/mean_length": 475.3214416503906,
+      "completions/mean_terminated_length": 475.3214416503906,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 1.976012380706732,
+      "grad_norm": 0.7875248789787292,
+      "kl": 0.11572265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0121,
+      "num_tokens": 223941633.0,
+      "reward": 1.4611608982086182,
+      "reward_std": 0.1961316615343094,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46116071939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.4118027985095978,
+      "step": 1915
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 906.0,
+      "completions/max_terminated_length": 906.0,
+      "completions/mean_length": 507.9732360839844,
+      "completions/mean_terminated_length": 507.9732360839844,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 1.9770441062677326,
+      "grad_norm": 0.723215639591217,
+      "kl": 0.107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0105,
+      "num_tokens": 224073559.0,
+      "reward": 1.4205358028411865,
+      "reward_std": 0.1860450804233551,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42053571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.4325992166996002,
+      "step": 1916
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 924.0,
+      "completions/max_terminated_length": 924.0,
+      "completions/mean_length": 459.9464416503906,
+      "completions/mean_terminated_length": 459.9464416503906,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 1.9780758318287335,
+      "grad_norm": 0.8373486399650574,
+      "kl": 0.121826171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0095,
+      "num_tokens": 224199749.0,
+      "reward": 1.4727680683135986,
+      "reward_std": 0.32642921805381775,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4727678596973419,
+      "rewards/curriculum_aware_reward_fn/std": 0.5004002451896667,
+      "step": 1917
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 832.0,
+      "completions/max_terminated_length": 832.0,
+      "completions/mean_length": 458.08038330078125,
+      "completions/mean_terminated_length": 458.08038330078125,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "epoch": 1.9791075573897343,
+      "grad_norm": 0.7150551676750183,
+      "kl": 0.1224365234375,
+      "learning_rate": 1e-06,
+      "loss": 0.025,
+      "num_tokens": 224321537.0,
+      "reward": 1.579017996788025,
+      "reward_std": 0.1882287859916687,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5790178179740906,
+      "rewards/curriculum_aware_reward_fn/std": 0.3917297124862671,
+      "step": 1918
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 902.0,
+      "completions/max_terminated_length": 902.0,
+      "completions/mean_length": 433.15179443359375,
+      "completions/mean_terminated_length": 433.15179443359375,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 1.9801392829507352,
+      "grad_norm": 0.6942814588546753,
+      "kl": 0.1180419921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0246,
+      "num_tokens": 224430936.0,
+      "reward": 1.6250001192092896,
+      "reward_std": 0.15352478623390198,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.625,
+      "rewards/curriculum_aware_reward_fn/std": 0.4262765645980835,
+      "step": 1919
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 899.0,
+      "completions/max_terminated_length": 899.0,
+      "completions/mean_length": 477.8839416503906,
+      "completions/mean_terminated_length": 477.8839416503906,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 1.981171008511736,
+      "grad_norm": 0.7555545568466187,
+      "kl": 0.1136474609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0153,
+      "num_tokens": 224546801.0,
+      "reward": 1.4424108266830444,
+      "reward_std": 0.23390239477157593,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4513393044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.46204888820648193,
+      "step": 1920
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 904.0,
+      "completions/max_terminated_length": 904.0,
+      "completions/mean_length": 485.95538330078125,
+      "completions/mean_terminated_length": 485.95538330078125,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 1.9822027340727366,
+      "grad_norm": 0.8747866153717041,
+      "kl": 0.118408203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0209,
+      "num_tokens": 224676212.0,
+      "reward": 1.4625002145767212,
+      "reward_std": 0.28620848059654236,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4625000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.39229631423950195,
+      "step": 1921
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 855.0,
+      "completions/max_terminated_length": 855.0,
+      "completions/mean_length": 447.0535888671875,
+      "completions/mean_terminated_length": 447.0535888671875,
+      "completions/min_length": 128.0,
+      "completions/min_terminated_length": 128.0,
+      "epoch": 1.9832344596337375,
+      "grad_norm": 0.6834079027175903,
+      "kl": 0.1190185546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0074,
+      "num_tokens": 224787147.0,
+      "reward": 1.5656250715255737,
+      "reward_std": 0.21303489804267883,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.565625011920929,
+      "rewards/curriculum_aware_reward_fn/std": 0.4493253827095032,
+      "step": 1922
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 999.0,
+      "completions/max_terminated_length": 999.0,
+      "completions/mean_length": 457.169677734375,
+      "completions/mean_terminated_length": 457.169677734375,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 1.984266185194738,
+      "grad_norm": 0.6467311978340149,
+      "kl": 0.111572265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0097,
+      "num_tokens": 224904654.0,
+      "reward": 1.4964287281036377,
+      "reward_std": 0.21329189836978912,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5053571462631226,
+      "rewards/curriculum_aware_reward_fn/std": 0.43719813227653503,
+      "step": 1923
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1199.0,
+      "completions/max_terminated_length": 1199.0,
+      "completions/mean_length": 493.107177734375,
+      "completions/mean_terminated_length": 493.107177734375,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 1.985297910755739,
+      "grad_norm": 0.7250528931617737,
+      "kl": 0.1156005859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0017,
+      "num_tokens": 225029518.0,
+      "reward": 1.4723213911056519,
+      "reward_std": 0.19657452404499054,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47232145071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.41202855110168457,
+      "step": 1924
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1174.0,
+      "completions/max_terminated_length": 1174.0,
+      "completions/mean_length": 473.6607360839844,
+      "completions/mean_terminated_length": 473.6607360839844,
+      "completions/min_length": 155.0,
+      "completions/min_terminated_length": 155.0,
+      "epoch": 1.9863296363167398,
+      "grad_norm": 0.6025116443634033,
+      "kl": 0.103271484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0156,
+      "num_tokens": 225151364.0,
+      "reward": 1.463392972946167,
+      "reward_std": 0.17400220036506653,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46339288353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.458060085773468,
+      "step": 1925
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 926.0,
+      "completions/max_terminated_length": 926.0,
+      "completions/mean_length": 484.9107360839844,
+      "completions/mean_terminated_length": 484.9107360839844,
+      "completions/min_length": 243.0,
+      "completions/min_terminated_length": 243.0,
+      "epoch": 1.9873613618777406,
+      "grad_norm": 0.7289581298828125,
+      "kl": 0.116943359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0009,
+      "num_tokens": 225285775.0,
+      "reward": 1.4758929014205933,
+      "reward_std": 0.2225145697593689,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4758928120136261,
+      "rewards/curriculum_aware_reward_fn/std": 0.43376532196998596,
+      "step": 1926
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1150.0,
+      "completions/max_terminated_length": 1150.0,
+      "completions/mean_length": 475.2589416503906,
+      "completions/mean_terminated_length": 475.2589416503906,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 1.9883930874387414,
+      "grad_norm": 0.7278993129730225,
+      "kl": 0.1214599609375,
+      "learning_rate": 1e-06,
+      "loss": -0.023,
+      "num_tokens": 225401234.0,
+      "reward": 1.3933037519454956,
+      "reward_std": 0.13688452541828156,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3933035731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.3881984055042267,
+      "step": 1927
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1149.0,
+      "completions/max_terminated_length": 1149.0,
+      "completions/mean_length": 492.3660888671875,
+      "completions/mean_terminated_length": 492.3660888671875,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 1.989424812999742,
+      "grad_norm": 0.6222229599952698,
+      "kl": 0.1170654296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0014,
+      "num_tokens": 225519750.0,
+      "reward": 1.5058035850524902,
+      "reward_std": 0.10875215381383896,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5058035850524902,
+      "rewards/curriculum_aware_reward_fn/std": 0.4427974820137024,
+      "step": 1928
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 877.0,
+      "completions/max_terminated_length": 877.0,
+      "completions/mean_length": 475.7410888671875,
+      "completions/mean_terminated_length": 475.7410888671875,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 1.990456538560743,
+      "grad_norm": 0.6316165328025818,
+      "kl": 0.114990234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0042,
+      "num_tokens": 225633576.0,
+      "reward": 1.466071605682373,
+      "reward_std": 0.14979317784309387,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4660714268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.44429388642311096,
+      "step": 1929
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 823.0,
+      "completions/max_terminated_length": 823.0,
+      "completions/mean_length": 471.9732360839844,
+      "completions/mean_terminated_length": 471.9732360839844,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "epoch": 1.9914882641217435,
+      "grad_norm": 0.9416745901107788,
+      "kl": 0.1495361328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0066,
+      "num_tokens": 225761036.0,
+      "reward": 1.4834821224212646,
+      "reward_std": 0.23354566097259521,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48348215222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.41413038969039917,
+      "step": 1930
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 978.0,
+      "completions/max_terminated_length": 978.0,
+      "completions/mean_length": 510.3839416503906,
+      "completions/mean_terminated_length": 510.3839416503906,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 1.9925199896827444,
+      "grad_norm": 0.7914566993713379,
+      "kl": 0.1246337890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0115,
+      "num_tokens": 225880345.0,
+      "reward": 1.4580358266830444,
+      "reward_std": 0.2331819087266922,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4669643044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.4202934205532074,
+      "step": 1931
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 960.0,
+      "completions/max_terminated_length": 960.0,
+      "completions/mean_length": 454.5714416503906,
+      "completions/mean_terminated_length": 454.5714416503906,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 1.9935517152437452,
+      "grad_norm": 0.7780898213386536,
+      "kl": 0.114990234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0304,
+      "num_tokens": 225991891.0,
+      "reward": 1.440178632736206,
+      "reward_std": 0.21832521259784698,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4401785433292389,
+      "rewards/curriculum_aware_reward_fn/std": 0.43463972210884094,
+      "step": 1932
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1073.0,
+      "completions/max_terminated_length": 1073.0,
+      "completions/mean_length": 484.6160888671875,
+      "completions/mean_terminated_length": 484.6160888671875,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 1.994583440804746,
+      "grad_norm": 0.7472946047782898,
+      "kl": 0.1243896484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0059,
+      "num_tokens": 226113494.0,
+      "reward": 1.493303656578064,
+      "reward_std": 0.17449244856834412,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4933035671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.44089874625205994,
+      "step": 1933
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 846.0,
+      "completions/max_terminated_length": 846.0,
+      "completions/mean_length": 435.7321472167969,
+      "completions/mean_terminated_length": 435.7321472167969,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "epoch": 1.9956151663657469,
+      "grad_norm": 0.8039238452911377,
+      "kl": 0.12939453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0183,
+      "num_tokens": 226225940.0,
+      "reward": 1.471428632736206,
+      "reward_std": 0.296146035194397,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4714285731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.4306769371032715,
+      "step": 1934
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1407.0,
+      "completions/max_terminated_length": 1407.0,
+      "completions/mean_length": 511.5535888671875,
+      "completions/mean_terminated_length": 511.5535888671875,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 1.9966468919267475,
+      "grad_norm": 0.6034707427024841,
+      "kl": 0.1014404296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0195,
+      "num_tokens": 226352324.0,
+      "reward": 1.3870537281036377,
+      "reward_std": 0.13577203452587128,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38705354928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.44091150164604187,
+      "step": 1935
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 999.0,
+      "completions/max_terminated_length": 999.0,
+      "completions/mean_length": 481.0357360839844,
+      "completions/mean_terminated_length": 481.0357360839844,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 1.997678617487748,
+      "grad_norm": 0.6917321085929871,
+      "kl": 0.117919921875,
+      "learning_rate": 1e-06,
+      "loss": 0.015,
+      "num_tokens": 226477145.0,
+      "reward": 1.6187500953674316,
+      "reward_std": 0.21138519048690796,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6187500357627869,
+      "rewards/curriculum_aware_reward_fn/std": 0.40886160731315613,
+      "step": 1936
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1279.0,
+      "completions/max_terminated_length": 1279.0,
+      "completions/mean_length": 517.0892944335938,
+      "completions/mean_terminated_length": 517.0892944335938,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 1.998710343048749,
+      "grad_norm": 0.6973819732666016,
+      "kl": 0.12451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0221,
+      "num_tokens": 226607495.0,
+      "reward": 1.3973214626312256,
+      "reward_std": 0.19465167820453644,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3973214328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.43287426233291626,
+      "step": 1937
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1028.0,
+      "completions/max_terminated_length": 1028.0,
+      "completions/mean_length": 586.7000122070312,
+      "completions/mean_terminated_length": 586.7000122070312,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "epoch": 1.9997420686097498,
+      "grad_norm": 0.7009508609771729,
+      "kl": 0.1156005859375,
+      "learning_rate": 1e-06,
+      "loss": 0.007,
+      "num_tokens": 226742568.0,
+      "reward": 1.4357143640518188,
+      "reward_std": 0.2279166579246521,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4357142746448517,
+      "rewards/curriculum_aware_reward_fn/std": 0.42274460196495056,
+      "step": 1938
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1184.0,
+      "completions/max_terminated_length": 1184.0,
+      "completions/mean_length": 512.6428833007812,
+      "completions/mean_terminated_length": 512.6428833007812,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "epoch": 2.001031725561001,
+      "grad_norm": 0.6595158576965332,
+      "kl": 0.1156005859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0156,
+      "num_tokens": 226869480.0,
+      "reward": 1.5250000953674316,
+      "reward_std": 0.2136206030845642,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5249999761581421,
+      "rewards/curriculum_aware_reward_fn/std": 0.4552001953125,
+      "step": 1939
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1072.0,
+      "completions/max_terminated_length": 1072.0,
+      "completions/mean_length": 501.294677734375,
+      "completions/mean_terminated_length": 501.294677734375,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 2.0020634511220017,
+      "grad_norm": 0.7600402235984802,
+      "kl": 0.117919921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0038,
+      "num_tokens": 226996360.0,
+      "reward": 1.4607144594192505,
+      "reward_std": 0.19837914407253265,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46071428060531616,
+      "rewards/curriculum_aware_reward_fn/std": 0.4279339015483856,
+      "step": 1940
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1133.0,
+      "completions/max_terminated_length": 1133.0,
+      "completions/mean_length": 491.2500305175781,
+      "completions/mean_terminated_length": 491.2500305175781,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 2.0030951766830025,
+      "grad_norm": 0.6804510354995728,
+      "kl": 0.1165771484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0002,
+      "num_tokens": 227117908.0,
+      "reward": 1.6517857313156128,
+      "reward_std": 0.15756270289421082,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6517857313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.4126342833042145,
+      "step": 1941
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1030.0,
+      "completions/max_terminated_length": 1030.0,
+      "completions/mean_length": 457.83929443359375,
+      "completions/mean_terminated_length": 457.83929443359375,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 2.004126902244003,
+      "grad_norm": 0.7952170372009277,
+      "kl": 0.1226806640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0389,
+      "num_tokens": 227227538.0,
+      "reward": 1.5852677822113037,
+      "reward_std": 0.1368647962808609,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5852679014205933,
+      "rewards/curriculum_aware_reward_fn/std": 0.430044949054718,
+      "step": 1942
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1116.0,
+      "completions/max_terminated_length": 1116.0,
+      "completions/mean_length": 480.794677734375,
+      "completions/mean_terminated_length": 480.794677734375,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 2.0051586278050038,
+      "grad_norm": 0.7373249530792236,
+      "kl": 0.12255859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0114,
+      "num_tokens": 227346823.0,
+      "reward": 1.4571430683135986,
+      "reward_std": 0.2077142298221588,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4660714268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.4125407040119171,
+      "step": 1943
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 920.0,
+      "completions/max_terminated_length": 920.0,
+      "completions/mean_length": 512.5267944335938,
+      "completions/mean_terminated_length": 512.5267944335938,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 2.0061903533660046,
+      "grad_norm": 0.7481683492660522,
+      "kl": 0.11279296875,
+      "learning_rate": 1e-06,
+      "loss": 0.029,
+      "num_tokens": 227473050.0,
+      "reward": 1.4656251668930054,
+      "reward_std": 0.2195693999528885,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46562501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.4202118217945099,
+      "step": 1944
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 983.0,
+      "completions/max_terminated_length": 983.0,
+      "completions/mean_length": 476.3750305175781,
+      "completions/mean_terminated_length": 476.3750305175781,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 2.0072220789270054,
+      "grad_norm": 0.7075243592262268,
+      "kl": 0.110107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0045,
+      "num_tokens": 227591231.0,
+      "reward": 1.5468751192092896,
+      "reward_std": 0.22008267045021057,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5468749403953552,
+      "rewards/curriculum_aware_reward_fn/std": 0.41515660285949707,
+      "step": 1945
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1507.0,
+      "completions/max_terminated_length": 1507.0,
+      "completions/mean_length": 458.6339416503906,
+      "completions/mean_terminated_length": 458.6339416503906,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 2.0082538044880063,
+      "grad_norm": 0.6288896799087524,
+      "kl": 0.1248779296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0083,
+      "num_tokens": 227709882.0,
+      "reward": 1.6352678537368774,
+      "reward_std": 0.11739077419042587,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6352678537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.42983540892601013,
+      "step": 1946
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3775.0,
+      "completions/max_terminated_length": 3775.0,
+      "completions/mean_length": 556.3839721679688,
+      "completions/mean_terminated_length": 556.3839721679688,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 2.009285530049007,
+      "grad_norm": 0.6854794025421143,
+      "kl": 0.10791015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0265,
+      "num_tokens": 227840167.0,
+      "reward": 1.472321629524231,
+      "reward_std": 0.22720672190189362,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47232145071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.40945136547088623,
+      "step": 1947
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 970.0,
+      "completions/max_terminated_length": 970.0,
+      "completions/mean_length": 480.4732360839844,
+      "completions/mean_terminated_length": 480.4732360839844,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "epoch": 2.010317255610008,
+      "grad_norm": 0.7467259764671326,
+      "kl": 0.1180419921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0057,
+      "num_tokens": 227969972.0,
+      "reward": 1.458482265472412,
+      "reward_std": 0.2527380883693695,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45848211646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.4306337237358093,
+      "step": 1948
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1184.0,
+      "completions/max_terminated_length": 1184.0,
+      "completions/mean_length": 477.83929443359375,
+      "completions/mean_terminated_length": 477.83929443359375,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 2.0113489811710084,
+      "grad_norm": 0.7228710651397705,
+      "kl": 0.10400390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0061,
+      "num_tokens": 228084089.0,
+      "reward": 1.5075894594192505,
+      "reward_std": 0.15247538685798645,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5075892806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.4195777475833893,
+      "step": 1949
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1355.0,
+      "completions/max_terminated_length": 1355.0,
+      "completions/mean_length": 481.0625305175781,
+      "completions/mean_terminated_length": 481.0625305175781,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 2.012380706732009,
+      "grad_norm": 0.8138713836669922,
+      "kl": 0.1185302734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0018,
+      "num_tokens": 228207065.0,
+      "reward": 1.489285945892334,
+      "reward_std": 0.19530774652957916,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4892857074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.39298057556152344,
+      "step": 1950
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 992.0,
+      "completions/max_terminated_length": 992.0,
+      "completions/mean_length": 481.5089416503906,
+      "completions/mean_terminated_length": 481.5089416503906,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 2.01341243229301,
+      "grad_norm": 0.7092810273170471,
+      "kl": 0.1160888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0208,
+      "num_tokens": 228322559.0,
+      "reward": 1.519196629524231,
+      "reward_std": 0.20195798575878143,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.528124988079071,
+      "rewards/curriculum_aware_reward_fn/std": 0.45046430826187134,
+      "step": 1951
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 820.0,
+      "completions/max_terminated_length": 820.0,
+      "completions/mean_length": 433.2589416503906,
+      "completions/mean_terminated_length": 433.2589416503906,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 2.014444157854011,
+      "grad_norm": 0.679499626159668,
+      "kl": 0.130859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0119,
+      "num_tokens": 228432418.0,
+      "reward": 1.6607143878936768,
+      "reward_std": 0.16779665648937225,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6607142686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.41298893094062805,
+      "step": 1952
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1436.0,
+      "completions/max_terminated_length": 1436.0,
+      "completions/mean_length": 497.0982360839844,
+      "completions/mean_terminated_length": 497.0982360839844,
+      "completions/min_length": 263.0,
+      "completions/min_terminated_length": 263.0,
+      "epoch": 2.0154758834150117,
+      "grad_norm": 0.7596502304077148,
+      "kl": 0.13232421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0314,
+      "num_tokens": 228550869.0,
+      "reward": 1.43348228931427,
+      "reward_std": 0.14467096328735352,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43348217010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.5326877236366272,
+      "step": 1953
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1593.0,
+      "completions/max_terminated_length": 1593.0,
+      "completions/mean_length": 538.9910888671875,
+      "completions/mean_terminated_length": 538.9910888671875,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 2.0165076089760126,
+      "grad_norm": 0.6555126905441284,
+      "kl": 0.1097412109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0132,
+      "num_tokens": 228693178.0,
+      "reward": 1.4982144832611084,
+      "reward_std": 0.17982201278209686,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4982143044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.5427212119102478,
+      "step": 1954
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 896.0,
+      "completions/max_terminated_length": 896.0,
+      "completions/mean_length": 456.76788330078125,
+      "completions/mean_terminated_length": 456.76788330078125,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "epoch": 2.017539334537013,
+      "grad_norm": 0.8210303783416748,
+      "kl": 0.118408203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0081,
+      "num_tokens": 228805950.0,
+      "reward": 1.419196605682373,
+      "reward_std": 0.19702807068824768,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41919639706611633,
+      "rewards/curriculum_aware_reward_fn/std": 0.3827499747276306,
+      "step": 1955
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 971.0,
+      "completions/max_terminated_length": 971.0,
+      "completions/mean_length": 457.7410888671875,
+      "completions/mean_terminated_length": 457.7410888671875,
+      "completions/min_length": 114.0,
+      "completions/min_terminated_length": 114.0,
+      "epoch": 2.018571060098014,
+      "grad_norm": 0.6348860859870911,
+      "kl": 0.1182861328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0058,
+      "num_tokens": 228925901.0,
+      "reward": 1.4950894117355347,
+      "reward_std": 0.1397673785686493,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4950893223285675,
+      "rewards/curriculum_aware_reward_fn/std": 0.4401044249534607,
+      "step": 1956
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 967.0,
+      "completions/max_terminated_length": 967.0,
+      "completions/mean_length": 469.6160888671875,
+      "completions/mean_terminated_length": 469.6160888671875,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "epoch": 2.0196027856590146,
+      "grad_norm": 0.6954073905944824,
+      "kl": 0.1123046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0043,
+      "num_tokens": 229040315.0,
+      "reward": 1.3888394832611084,
+      "reward_std": 0.15426510572433472,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3888392746448517,
+      "rewards/curriculum_aware_reward_fn/std": 0.39879944920539856,
+      "step": 1957
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1366.0,
+      "completions/max_terminated_length": 1366.0,
+      "completions/mean_length": 514.9375,
+      "completions/mean_terminated_length": 514.9375,
+      "completions/min_length": 270.0,
+      "completions/min_terminated_length": 270.0,
+      "epoch": 2.0206345112200155,
+      "grad_norm": 0.7137829065322876,
+      "kl": 0.11083984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0218,
+      "num_tokens": 229172391.0,
+      "reward": 1.4209822416305542,
+      "reward_std": 0.2412976324558258,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42098215222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.4099350869655609,
+      "step": 1958
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 951.0,
+      "completions/max_terminated_length": 951.0,
+      "completions/mean_length": 466.9285888671875,
+      "completions/mean_terminated_length": 466.9285888671875,
+      "completions/min_length": 231.0,
+      "completions/min_terminated_length": 231.0,
+      "epoch": 2.0216662367810163,
+      "grad_norm": 0.6483295559883118,
+      "kl": 0.1134033203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0111,
+      "num_tokens": 229287953.0,
+      "reward": 1.516964316368103,
+      "reward_std": 0.17437413334846497,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.516964316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.42762458324432373,
+      "step": 1959
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 796.0,
+      "completions/max_terminated_length": 796.0,
+      "completions/mean_length": 383.08038330078125,
+      "completions/mean_terminated_length": 383.08038330078125,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 2.022697962342017,
+      "grad_norm": 0.8443179726600647,
+      "kl": 0.1192626953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0185,
+      "num_tokens": 229382812.0,
+      "reward": 1.5861607789993286,
+      "reward_std": 0.18597619235515594,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5861607193946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.43434789776802063,
+      "step": 1960
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1641.0,
+      "completions/max_terminated_length": 1641.0,
+      "completions/mean_length": 518.375,
+      "completions/mean_terminated_length": 518.375,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 2.023729687903018,
+      "grad_norm": 0.6534212827682495,
+      "kl": 0.0992431640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0172,
+      "num_tokens": 229510891.0,
+      "reward": 1.4928573369979858,
+      "reward_std": 0.1748374104499817,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4928571581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.45231422781944275,
+      "step": 1961
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1637.0,
+      "completions/max_terminated_length": 1637.0,
+      "completions/mean_length": 496.2232360839844,
+      "completions/mean_terminated_length": 496.2232360839844,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 2.0247614134640184,
+      "grad_norm": 0.6812605857849121,
+      "kl": 0.109619140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0112,
+      "num_tokens": 229633220.0,
+      "reward": 1.4276787042617798,
+      "reward_std": 0.17530037462711334,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42767858505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.39585912227630615,
+      "step": 1962
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1231.0,
+      "completions/max_terminated_length": 1231.0,
+      "completions/mean_length": 505.1785888671875,
+      "completions/mean_terminated_length": 505.1785888671875,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "epoch": 2.0257931390250192,
+      "grad_norm": 0.6694918870925903,
+      "kl": 0.1153564453125,
+      "learning_rate": 1e-06,
+      "loss": -0.012,
+      "num_tokens": 229768067.0,
+      "reward": 1.3674107789993286,
+      "reward_std": 0.17926202714443207,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36741071939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.41163864731788635,
+      "step": 1963
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 884.0,
+      "completions/max_terminated_length": 884.0,
+      "completions/mean_length": 391.4464416503906,
+      "completions/mean_terminated_length": 391.4464416503906,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "epoch": 2.02682486458602,
+      "grad_norm": 0.8477822542190552,
+      "kl": 0.1187744140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0256,
+      "num_tokens": 229870009.0,
+      "reward": 1.5107144117355347,
+      "reward_std": 0.12055130302906036,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5107142925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.4163563549518585,
+      "step": 1964
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1400.0,
+      "completions/max_terminated_length": 1400.0,
+      "completions/mean_length": 458.51788330078125,
+      "completions/mean_terminated_length": 458.51788330078125,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 2.027856590147021,
+      "grad_norm": 0.7395011782646179,
+      "kl": 0.1177978515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0153,
+      "num_tokens": 229999322.0,
+      "reward": 1.4848215579986572,
+      "reward_std": 0.254284143447876,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4848214089870453,
+      "rewards/curriculum_aware_reward_fn/std": 0.44599515199661255,
+      "step": 1965
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3908.0,
+      "completions/max_terminated_length": 3908.0,
+      "completions/mean_length": 487.5357360839844,
+      "completions/mean_terminated_length": 487.5357360839844,
+      "completions/min_length": 257.0,
+      "completions/min_terminated_length": 257.0,
+      "epoch": 2.0288883157080218,
+      "grad_norm": 0.7274074554443359,
+      "kl": 0.1099853515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0304,
+      "num_tokens": 230124360.0,
+      "reward": 1.4107143878936768,
+      "reward_std": 0.19167140126228333,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4107142984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.3991222679615021,
+      "step": 1966
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 798.0,
+      "completions/max_terminated_length": 798.0,
+      "completions/mean_length": 455.857177734375,
+      "completions/mean_terminated_length": 455.857177734375,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "epoch": 2.0299200412690226,
+      "grad_norm": 0.776483416557312,
+      "kl": 0.11767578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0125,
+      "num_tokens": 230240411.0,
+      "reward": 1.6031250953674316,
+      "reward_std": 0.21919363737106323,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6031250357627869,
+      "rewards/curriculum_aware_reward_fn/std": 0.41423332691192627,
+      "step": 1967
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 729.0,
+      "completions/max_terminated_length": 729.0,
+      "completions/mean_length": 410.95538330078125,
+      "completions/mean_terminated_length": 410.95538330078125,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "epoch": 2.0309517668300234,
+      "grad_norm": 0.8736270666122437,
+      "kl": 0.1171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0058,
+      "num_tokens": 230345553.0,
+      "reward": 1.5915179252624512,
+      "reward_std": 0.28784704208374023,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5915178656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.42116785049438477,
+      "step": 1968
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 854.0,
+      "completions/max_terminated_length": 854.0,
+      "completions/mean_length": 472.46429443359375,
+      "completions/mean_terminated_length": 472.46429443359375,
+      "completions/min_length": 274.0,
+      "completions/min_terminated_length": 274.0,
+      "epoch": 2.031983492391024,
+      "grad_norm": 0.7397417426109314,
+      "kl": 0.120849609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0099,
+      "num_tokens": 230474776.0,
+      "reward": 1.4187500476837158,
+      "reward_std": 0.18822595477104187,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41875001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.39763516187667847,
+      "step": 1969
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 966.0,
+      "completions/max_terminated_length": 966.0,
+      "completions/mean_length": 442.02679443359375,
+      "completions/mean_terminated_length": 442.02679443359375,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 2.0330152179520247,
+      "grad_norm": 0.6692867279052734,
+      "kl": 0.1123046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0179,
+      "num_tokens": 230596495.0,
+      "reward": 1.430803656578064,
+      "reward_std": 0.19043758511543274,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4308035671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.46688762307167053,
+      "step": 1970
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 908.0,
+      "completions/mean_length": 447.08038330078125,
+      "completions/mean_terminated_length": 414.20721435546875,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 2.0340469435130255,
+      "grad_norm": 0.8106693029403687,
+      "kl": 0.11962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0231,
+      "num_tokens": 230709484.0,
+      "reward": 1.614732265472412,
+      "reward_std": 0.18752601742744446,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6147322058677673,
+      "rewards/curriculum_aware_reward_fn/std": 0.430044949054718,
+      "step": 1971
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 694.0,
+      "completions/max_terminated_length": 694.0,
+      "completions/mean_length": 402.95538330078125,
+      "completions/mean_terminated_length": 402.95538330078125,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "epoch": 2.0350786690740263,
+      "grad_norm": 0.7740663886070251,
+      "kl": 0.1324462890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0214,
+      "num_tokens": 230830439.0,
+      "reward": 1.4558035135269165,
+      "reward_std": 0.2036939114332199,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45580360293388367,
+      "rewards/curriculum_aware_reward_fn/std": 0.4446248412132263,
+      "step": 1972
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 778.0,
+      "completions/max_terminated_length": 778.0,
+      "completions/mean_length": 399.6964416503906,
+      "completions/mean_terminated_length": 399.6964416503906,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 2.036110394635027,
+      "grad_norm": 0.6959367990493774,
+      "kl": 0.111572265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0089,
+      "num_tokens": 230939528.0,
+      "reward": 1.5232144594192505,
+      "reward_std": 0.14704690873622894,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5232142806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.4357311427593231,
+      "step": 1973
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1014.0,
+      "completions/max_terminated_length": 1014.0,
+      "completions/mean_length": 429.8214416503906,
+      "completions/mean_terminated_length": 429.8214416503906,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 2.037142120196028,
+      "grad_norm": 0.8482027053833008,
+      "kl": 0.128173828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0026,
+      "num_tokens": 231052964.0,
+      "reward": 1.505357265472412,
+      "reward_std": 0.23154647648334503,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5053571462631226,
+      "rewards/curriculum_aware_reward_fn/std": 0.4438953697681427,
+      "step": 1974
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1496.0,
+      "completions/max_terminated_length": 1496.0,
+      "completions/mean_length": 463.5625305175781,
+      "completions/mean_terminated_length": 463.5625305175781,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 2.0381738457570284,
+      "grad_norm": 0.7355022430419922,
+      "kl": 0.110595703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0151,
+      "num_tokens": 231172748.0,
+      "reward": 1.473660945892334,
+      "reward_std": 0.19264405965805054,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4736607074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.44117775559425354,
+      "step": 1975
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1138.0,
+      "completions/max_terminated_length": 1138.0,
+      "completions/mean_length": 461.232177734375,
+      "completions/mean_terminated_length": 461.232177734375,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 2.0392055713180293,
+      "grad_norm": 0.7399516701698303,
+      "kl": 0.121337890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0018,
+      "num_tokens": 231292131.0,
+      "reward": 1.500892996788025,
+      "reward_std": 0.1874261349439621,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5008928179740906,
+      "rewards/curriculum_aware_reward_fn/std": 0.4057571291923523,
+      "step": 1976
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 759.0,
+      "completions/max_terminated_length": 759.0,
+      "completions/mean_length": 430.14288330078125,
+      "completions/mean_terminated_length": 430.14288330078125,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 2.04023729687903,
+      "grad_norm": 0.7026528716087341,
+      "kl": 0.116455078125,
+      "learning_rate": 1e-06,
+      "loss": -0.002,
+      "num_tokens": 231408900.0,
+      "reward": 1.4950892925262451,
+      "reward_std": 0.23565950989723206,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5040178894996643,
+      "rewards/curriculum_aware_reward_fn/std": 0.417475163936615,
+      "step": 1977
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 841.0,
+      "completions/max_terminated_length": 841.0,
+      "completions/mean_length": 386.2946472167969,
+      "completions/mean_terminated_length": 386.2946472167969,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 2.041269022440031,
+      "grad_norm": 0.6979001760482788,
+      "kl": 0.1278076171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0013,
+      "num_tokens": 231505288.0,
+      "reward": 1.579017996788025,
+      "reward_std": 0.15396980941295624,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5790178179740906,
+      "rewards/curriculum_aware_reward_fn/std": 0.4313000738620758,
+      "step": 1978
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 856.0,
+      "completions/max_terminated_length": 856.0,
+      "completions/mean_length": 418.6607360839844,
+      "completions/mean_terminated_length": 418.6607360839844,
+      "completions/min_length": 78.0,
+      "completions/min_terminated_length": 78.0,
+      "epoch": 2.042300748001032,
+      "grad_norm": 0.623936653137207,
+      "kl": 0.106201171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0035,
+      "num_tokens": 231611468.0,
+      "reward": 1.4375001192092896,
+      "reward_std": 0.10704384744167328,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4375,
+      "rewards/curriculum_aware_reward_fn/std": 0.4572489261627197,
+      "step": 1979
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 864.0,
+      "completions/max_terminated_length": 864.0,
+      "completions/mean_length": 438.3660888671875,
+      "completions/mean_terminated_length": 438.3660888671875,
+      "completions/min_length": 105.0,
+      "completions/min_terminated_length": 105.0,
+      "epoch": 2.0433324735620326,
+      "grad_norm": 0.8500005006790161,
+      "kl": 0.1148681640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0131,
+      "num_tokens": 231733932.0,
+      "reward": 1.4482144117355347,
+      "reward_std": 0.20263758301734924,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4482142925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.4057541489601135,
+      "step": 1980
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 941.0,
+      "completions/max_terminated_length": 941.0,
+      "completions/mean_length": 435.58929443359375,
+      "completions/mean_terminated_length": 435.58929443359375,
+      "completions/min_length": 153.0,
+      "completions/min_terminated_length": 153.0,
+      "epoch": 2.0443641991230335,
+      "grad_norm": 0.8085100054740906,
+      "kl": 0.11767578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0027,
+      "num_tokens": 231847104.0,
+      "reward": 1.3821427822113037,
+      "reward_std": 0.1722569614648819,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3821428716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.40719079971313477,
+      "step": 1981
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 988.0,
+      "completions/max_terminated_length": 988.0,
+      "completions/mean_length": 440.0625305175781,
+      "completions/mean_terminated_length": 440.0625305175781,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 2.045395924684034,
+      "grad_norm": 0.8003664612770081,
+      "kl": 0.1328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0094,
+      "num_tokens": 231971060.0,
+      "reward": 1.4665179252624512,
+      "reward_std": 0.2330600619316101,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4665178656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.43689054250717163,
+      "step": 1982
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1208.0,
+      "completions/max_terminated_length": 1208.0,
+      "completions/mean_length": 459.77679443359375,
+      "completions/mean_terminated_length": 459.77679443359375,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "epoch": 2.0464276502450347,
+      "grad_norm": 0.877363383769989,
+      "kl": 0.127685546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0072,
+      "num_tokens": 232103818.0,
+      "reward": 1.516517996788025,
+      "reward_std": 0.22694124281406403,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5165178179740906,
+      "rewards/curriculum_aware_reward_fn/std": 0.3971967399120331,
+      "step": 1983
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1097.0,
+      "completions/max_terminated_length": 1097.0,
+      "completions/mean_length": 454.58038330078125,
+      "completions/mean_terminated_length": 454.58038330078125,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 2.0474593758060355,
+      "grad_norm": 0.721016526222229,
+      "kl": 0.1104736328125,
+      "learning_rate": 1e-06,
+      "loss": 0.008,
+      "num_tokens": 232211627.0,
+      "reward": 1.4062501192092896,
+      "reward_std": 0.17761258780956268,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4151785671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.4367591440677643,
+      "step": 1984
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1186.0,
+      "completions/max_terminated_length": 1186.0,
+      "completions/mean_length": 427.1250305175781,
+      "completions/mean_terminated_length": 427.1250305175781,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 2.0484911013670364,
+      "grad_norm": 0.8724990487098694,
+      "kl": 0.125244140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0197,
+      "num_tokens": 232318846.0,
+      "reward": 1.532589316368103,
+      "reward_std": 0.19645501673221588,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.532589316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.39749833941459656,
+      "step": 1985
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 992.0,
+      "completions/max_terminated_length": 992.0,
+      "completions/mean_length": 461.2589416503906,
+      "completions/mean_terminated_length": 461.2589416503906,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 2.0495228269280372,
+      "grad_norm": 0.7752722501754761,
+      "kl": 0.1173095703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0036,
+      "num_tokens": 232438113.0,
+      "reward": 1.3625000715255737,
+      "reward_std": 0.2078167051076889,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37142857909202576,
+      "rewards/curriculum_aware_reward_fn/std": 0.42483246326446533,
+      "step": 1986
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 925.0,
+      "completions/max_terminated_length": 925.0,
+      "completions/mean_length": 445.4464416503906,
+      "completions/mean_terminated_length": 445.4464416503906,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 2.050554552489038,
+      "grad_norm": 0.7875111103057861,
+      "kl": 0.125244140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0083,
+      "num_tokens": 232558479.0,
+      "reward": 1.469642996788025,
+      "reward_std": 0.20513944327831268,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46964284777641296,
+      "rewards/curriculum_aware_reward_fn/std": 0.500376284122467,
+      "step": 1987
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1101.0,
+      "completions/max_terminated_length": 1101.0,
+      "completions/mean_length": 458.3482360839844,
+      "completions/mean_terminated_length": 458.3482360839844,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 2.051586278050039,
+      "grad_norm": 0.9260916113853455,
+      "kl": 0.1683349609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0123,
+      "num_tokens": 232678743.0,
+      "reward": 1.3781250715255737,
+      "reward_std": 0.2220853716135025,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38705354928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.4021756947040558,
+      "step": 1988
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 910.0,
+      "completions/max_terminated_length": 910.0,
+      "completions/mean_length": 469.01788330078125,
+      "completions/mean_terminated_length": 469.01788330078125,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 2.0526180036110393,
+      "grad_norm": 0.8016949892044067,
+      "kl": 0.11279296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0059,
+      "num_tokens": 232797059.0,
+      "reward": 1.4160715341567993,
+      "reward_std": 0.24108101427555084,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41607141494750977,
+      "rewards/curriculum_aware_reward_fn/std": 0.40842556953430176,
+      "step": 1989
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 938.0,
+      "completions/max_terminated_length": 938.0,
+      "completions/mean_length": 449.46429443359375,
+      "completions/mean_terminated_length": 449.46429443359375,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 2.05364972917204,
+      "grad_norm": 0.7085789442062378,
+      "kl": 0.135009765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0379,
+      "num_tokens": 232920269.0,
+      "reward": 1.4468750953674316,
+      "reward_std": 0.18384109437465668,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4558035731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.4252050817012787,
+      "step": 1990
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 872.0,
+      "completions/max_terminated_length": 872.0,
+      "completions/mean_length": 414.51788330078125,
+      "completions/mean_terminated_length": 414.51788330078125,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 2.054681454733041,
+      "grad_norm": 0.7596367597579956,
+      "kl": 0.1248779296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0057,
+      "num_tokens": 233034840.0,
+      "reward": 1.6830357313156128,
+      "reward_std": 0.19633789360523224,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6830357313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.3981688916683197,
+      "step": 1991
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 818.0,
+      "completions/max_terminated_length": 818.0,
+      "completions/mean_length": 434.7321472167969,
+      "completions/mean_terminated_length": 434.7321472167969,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 2.055713180294042,
+      "grad_norm": 0.7070798277854919,
+      "kl": 0.128173828125,
+      "learning_rate": 1e-06,
+      "loss": -0.048,
+      "num_tokens": 233153936.0,
+      "reward": 1.474107265472412,
+      "reward_std": 0.1844610571861267,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48303571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.44309282302856445,
+      "step": 1992
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1014.0,
+      "completions/max_terminated_length": 1014.0,
+      "completions/mean_length": 471.20538330078125,
+      "completions/mean_terminated_length": 471.20538330078125,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 2.0567449058550427,
+      "grad_norm": 0.7285985946655273,
+      "kl": 0.1134033203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0097,
+      "num_tokens": 233285020.0,
+      "reward": 1.352678656578064,
+      "reward_std": 0.2566676139831543,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3705357015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.3969224691390991,
+      "step": 1993
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1145.0,
+      "completions/max_terminated_length": 1145.0,
+      "completions/mean_length": 456.9285888671875,
+      "completions/mean_terminated_length": 456.9285888671875,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 2.0577766314160435,
+      "grad_norm": 0.6904830932617188,
+      "kl": 0.12255859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0011,
+      "num_tokens": 233403886.0,
+      "reward": 1.4937502145767212,
+      "reward_std": 0.22515329718589783,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5205357670783997,
+      "rewards/curriculum_aware_reward_fn/std": 0.43436571955680847,
+      "step": 1994
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 885.0,
+      "completions/max_terminated_length": 885.0,
+      "completions/mean_length": 435.01788330078125,
+      "completions/mean_terminated_length": 435.01788330078125,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 2.058808356977044,
+      "grad_norm": 0.8098060488700867,
+      "kl": 0.121337890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0046,
+      "num_tokens": 233517442.0,
+      "reward": 1.4343750476837158,
+      "reward_std": 0.31481003761291504,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46116071939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.44442036747932434,
+      "step": 1995
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 871.0,
+      "completions/mean_length": 471.8482360839844,
+      "completions/mean_terminated_length": 439.1982116699219,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "epoch": 2.0598400825380447,
+      "grad_norm": 0.8379706740379333,
+      "kl": 0.1239013671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0006,
+      "num_tokens": 233646750.0,
+      "reward": 1.5308037996292114,
+      "reward_std": 0.30953019857406616,
+      "rewards/code_format_reward/mean": 0.9375,
+      "rewards/code_format_reward/std": 0.24314938485622406,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5933036208152771,
+      "rewards/curriculum_aware_reward_fn/std": 0.4207723140716553,
+      "step": 1996
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1083.0,
+      "completions/max_terminated_length": 1083.0,
+      "completions/mean_length": 415.5446472167969,
+      "completions/mean_terminated_length": 415.5446472167969,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "epoch": 2.0608718080990456,
+      "grad_norm": 0.7319987416267395,
+      "kl": 0.1361083984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0073,
+      "num_tokens": 233756263.0,
+      "reward": 1.6250001192092896,
+      "reward_std": 0.22156484425067902,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6428571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.4175139367580414,
+      "step": 1997
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 825.0,
+      "completions/max_terminated_length": 825.0,
+      "completions/mean_length": 432.5089416503906,
+      "completions/mean_terminated_length": 432.5089416503906,
+      "completions/min_length": 154.0,
+      "completions/min_terminated_length": 154.0,
+      "epoch": 2.0619035336600464,
+      "grad_norm": 0.8287761807441711,
+      "kl": 0.12109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0228,
+      "num_tokens": 233869881.0,
+      "reward": 1.5004465579986572,
+      "reward_std": 0.25090718269348145,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5004464387893677,
+      "rewards/curriculum_aware_reward_fn/std": 0.433012455701828,
+      "step": 1998
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 757.0,
+      "completions/max_terminated_length": 757.0,
+      "completions/mean_length": 394.9910888671875,
+      "completions/mean_terminated_length": 394.9910888671875,
+      "completions/min_length": 147.0,
+      "completions/min_terminated_length": 147.0,
+      "epoch": 2.0629352592210473,
+      "grad_norm": 0.6038341522216797,
+      "kl": 0.13330078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0148,
+      "num_tokens": 233975223.0,
+      "reward": 1.5892857313156128,
+      "reward_std": 0.1025947853922844,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5892857313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.43709877133369446,
+      "step": 1999
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 809.0,
+      "completions/max_terminated_length": 809.0,
+      "completions/mean_length": 432.83038330078125,
+      "completions/mean_terminated_length": 432.83038330078125,
+      "completions/min_length": 151.0,
+      "completions/min_terminated_length": 151.0,
+      "epoch": 2.063966984782048,
+      "grad_norm": 0.7323113083839417,
+      "kl": 0.132568359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0029,
+      "num_tokens": 234087932.0,
+      "reward": 1.4352679252624512,
+      "reward_std": 0.2148611694574356,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4441964328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.41675201058387756,
+      "step": 2000
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 878.0,
+      "completions/max_terminated_length": 878.0,
+      "completions/mean_length": 478.8750305175781,
+      "completions/mean_terminated_length": 478.8750305175781,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 2.064998710343049,
+      "grad_norm": 0.9021128416061401,
+      "kl": 0.1153564453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0074,
+      "num_tokens": 234217250.0,
+      "reward": 1.4133931398391724,
+      "reward_std": 0.30901095271110535,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4133928418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.39643582701683044,
+      "step": 2001
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1223.0,
+      "completions/max_terminated_length": 1223.0,
+      "completions/mean_length": 466.5000305175781,
+      "completions/mean_terminated_length": 466.5000305175781,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 2.0660304359040493,
+      "grad_norm": 0.7210460901260376,
+      "kl": 0.1270751953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0038,
+      "num_tokens": 234334227.0,
+      "reward": 1.4799107313156128,
+      "reward_std": 0.15783853828907013,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4799107015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.4287768006324768,
+      "step": 2002
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 917.0,
+      "completions/max_terminated_length": 917.0,
+      "completions/mean_length": 430.83038330078125,
+      "completions/mean_terminated_length": 430.83038330078125,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 2.06706216146505,
+      "grad_norm": 0.7734282612800598,
+      "kl": 0.11767578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0169,
+      "num_tokens": 234455770.0,
+      "reward": 1.5022321939468384,
+      "reward_std": 0.2398114651441574,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5022321343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.4340978264808655,
+      "step": 2003
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1012.0,
+      "completions/max_terminated_length": 1012.0,
+      "completions/mean_length": 408.83929443359375,
+      "completions/mean_terminated_length": 408.83929443359375,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "epoch": 2.068093887026051,
+      "grad_norm": 0.5895684957504272,
+      "kl": 0.117431640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0146,
+      "num_tokens": 234568732.0,
+      "reward": 1.520982265472412,
+      "reward_std": 0.11030647158622742,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5209821462631226,
+      "rewards/curriculum_aware_reward_fn/std": 0.46916821599006653,
+      "step": 2004
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1336.0,
+      "completions/max_terminated_length": 1336.0,
+      "completions/mean_length": 442.08929443359375,
+      "completions/mean_terminated_length": 442.08929443359375,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 2.069125612587052,
+      "grad_norm": 0.7536056041717529,
+      "kl": 0.1180419921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0247,
+      "num_tokens": 234681575.0,
+      "reward": 1.4781250953674316,
+      "reward_std": 0.18545761704444885,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47812503576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.39179542660713196,
+      "step": 2005
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1529.0,
+      "completions/max_terminated_length": 1529.0,
+      "completions/mean_length": 507.4464416503906,
+      "completions/mean_terminated_length": 507.4464416503906,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 2.0701573381480527,
+      "grad_norm": 0.7618375420570374,
+      "kl": 0.119873046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0213,
+      "num_tokens": 234820911.0,
+      "reward": 1.3924107551574707,
+      "reward_std": 0.2358359545469284,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3924107253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.4042683243751526,
+      "step": 2006
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1183.0,
+      "completions/max_terminated_length": 1183.0,
+      "completions/mean_length": 433.6071472167969,
+      "completions/mean_terminated_length": 433.6071472167969,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 2.0711890637090535,
+      "grad_norm": 0.5803200602531433,
+      "kl": 0.1112060546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0193,
+      "num_tokens": 234936976.0,
+      "reward": 1.5857144594192505,
+      "reward_std": 0.173648864030838,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5857143402099609,
+      "rewards/curriculum_aware_reward_fn/std": 0.4373563528060913,
+      "step": 2007
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 749.0,
+      "completions/max_terminated_length": 749.0,
+      "completions/mean_length": 407.9196472167969,
+      "completions/mean_terminated_length": 407.9196472167969,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 2.0722207892700544,
+      "grad_norm": 0.698860764503479,
+      "kl": 0.1298828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0149,
+      "num_tokens": 235052831.0,
+      "reward": 1.5660717487335205,
+      "reward_std": 0.14689664542675018,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5660714507102966,
+      "rewards/curriculum_aware_reward_fn/std": 0.3505603075027466,
+      "step": 2008
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 856.0,
+      "completions/max_terminated_length": 856.0,
+      "completions/mean_length": 435.4821472167969,
+      "completions/mean_terminated_length": 435.4821472167969,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 2.0732525148310548,
+      "grad_norm": 0.8031901121139526,
+      "kl": 0.119873046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0093,
+      "num_tokens": 235164283.0,
+      "reward": 1.432142972946167,
+      "reward_std": 0.250482439994812,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43214288353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.4090671241283417,
+      "step": 2009
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 915.0,
+      "completions/max_terminated_length": 915.0,
+      "completions/mean_length": 454.732177734375,
+      "completions/mean_terminated_length": 454.732177734375,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "epoch": 2.0742842403920556,
+      "grad_norm": 0.8125240206718445,
+      "kl": 0.106201171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0243,
+      "num_tokens": 235279816.0,
+      "reward": 1.497321605682373,
+      "reward_std": 0.28393781185150146,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4973214268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.4544989764690399,
+      "step": 2010
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 800.0,
+      "completions/max_terminated_length": 800.0,
+      "completions/mean_length": 414.58038330078125,
+      "completions/mean_terminated_length": 414.58038330078125,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 2.0753159659530565,
+      "grad_norm": 0.7212795615196228,
+      "kl": 0.117919921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0038,
+      "num_tokens": 235396557.0,
+      "reward": 1.5700894594192505,
+      "reward_std": 0.16203486919403076,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5700892806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.4427829384803772,
+      "step": 2011
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 763.0,
+      "completions/max_terminated_length": 763.0,
+      "completions/mean_length": 412.65179443359375,
+      "completions/mean_terminated_length": 412.65179443359375,
+      "completions/min_length": 149.0,
+      "completions/min_terminated_length": 149.0,
+      "epoch": 2.0763476915140573,
+      "grad_norm": 0.7393515706062317,
+      "kl": 0.125,
+      "learning_rate": 1e-06,
+      "loss": 0.0214,
+      "num_tokens": 235513838.0,
+      "reward": 1.594642996788025,
+      "reward_std": 0.22776253521442413,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5946429371833801,
+      "rewards/curriculum_aware_reward_fn/std": 0.4215136468410492,
+      "step": 2012
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 868.0,
+      "completions/max_terminated_length": 868.0,
+      "completions/mean_length": 470.5625305175781,
+      "completions/mean_terminated_length": 470.5625305175781,
+      "completions/min_length": 286.0,
+      "completions/min_terminated_length": 286.0,
+      "epoch": 2.077379417075058,
+      "grad_norm": 0.6410515308380127,
+      "kl": 0.1263427734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0024,
+      "num_tokens": 235642337.0,
+      "reward": 1.4433037042617798,
+      "reward_std": 0.1228586882352829,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44330358505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.4229079782962799,
+      "step": 2013
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 675.0,
+      "completions/max_terminated_length": 675.0,
+      "completions/mean_length": 411.9196472167969,
+      "completions/mean_terminated_length": 411.9196472167969,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 2.078411142636059,
+      "grad_norm": 0.6773447394371033,
+      "kl": 0.10888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0056,
+      "num_tokens": 235749351.0,
+      "reward": 1.645982265472412,
+      "reward_std": 0.17768552899360657,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6459822058677673,
+      "rewards/curriculum_aware_reward_fn/std": 0.4375988245010376,
+      "step": 2014
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1242.0,
+      "completions/max_terminated_length": 1242.0,
+      "completions/mean_length": 443.4285888671875,
+      "completions/mean_terminated_length": 443.4285888671875,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 2.0794428681970594,
+      "grad_norm": 0.6409063935279846,
+      "kl": 0.1204833984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0045,
+      "num_tokens": 235868154.0,
+      "reward": 1.4526787996292114,
+      "reward_std": 0.10559501498937607,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45267853140830994,
+      "rewards/curriculum_aware_reward_fn/std": 0.45071709156036377,
+      "step": 2015
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1450.0,
+      "completions/max_terminated_length": 1450.0,
+      "completions/mean_length": 473.3750305175781,
+      "completions/mean_terminated_length": 473.3750305175781,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 2.08047459375806,
+      "grad_norm": 0.7845041155815125,
+      "kl": 0.11279296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0071,
+      "num_tokens": 235984816.0,
+      "reward": 1.485267996788025,
+      "reward_std": 0.2349657416343689,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48526784777641296,
+      "rewards/curriculum_aware_reward_fn/std": 0.40289705991744995,
+      "step": 2016
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1621.0,
+      "completions/max_terminated_length": 1621.0,
+      "completions/mean_length": 451.919677734375,
+      "completions/mean_terminated_length": 451.919677734375,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 2.081506319319061,
+      "grad_norm": 0.7908880114555359,
+      "kl": 0.1138916015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0077,
+      "num_tokens": 236089653.0,
+      "reward": 1.6111608743667603,
+      "reward_std": 0.31487926840782166,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6111606955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.5522969961166382,
+      "step": 2017
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1074.0,
+      "completions/max_terminated_length": 1074.0,
+      "completions/mean_length": 419.4464416503906,
+      "completions/mean_terminated_length": 419.4464416503906,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 2.082538044880062,
+      "grad_norm": 0.7708176970481873,
+      "kl": 0.1177978515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0039,
+      "num_tokens": 236195263.0,
+      "reward": 1.563392996788025,
+      "reward_std": 0.19081752002239227,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5633928179740906,
+      "rewards/curriculum_aware_reward_fn/std": 0.42452844977378845,
+      "step": 2018
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 890.0,
+      "completions/max_terminated_length": 890.0,
+      "completions/mean_length": 433.5714416503906,
+      "completions/mean_terminated_length": 433.5714416503906,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 2.0835697704410627,
+      "grad_norm": 0.6971049904823303,
+      "kl": 0.120849609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0019,
+      "num_tokens": 236316273.0,
+      "reward": 1.4839287996292114,
+      "reward_std": 0.2088971883058548,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4839285910129547,
+      "rewards/curriculum_aware_reward_fn/std": 0.4364173710346222,
+      "step": 2019
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 876.0,
+      "completions/max_terminated_length": 876.0,
+      "completions/mean_length": 457.169677734375,
+      "completions/mean_terminated_length": 457.169677734375,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 2.0846014960020636,
+      "grad_norm": 0.8070178031921387,
+      "kl": 0.1220703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0044,
+      "num_tokens": 236427795.0,
+      "reward": 1.4075894355773926,
+      "reward_std": 0.20527462661266327,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40758928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3741181790828705,
+      "step": 2020
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 693.0,
+      "completions/max_terminated_length": 693.0,
+      "completions/mean_length": 405.8125305175781,
+      "completions/mean_terminated_length": 405.8125305175781,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 2.0856332215630644,
+      "grad_norm": 0.7929508090019226,
+      "kl": 0.124267578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0199,
+      "num_tokens": 236541265.0,
+      "reward": 1.5303572416305542,
+      "reward_std": 0.1638633906841278,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5303571820259094,
+      "rewards/curriculum_aware_reward_fn/std": 0.37837710976600647,
+      "step": 2021
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 718.0,
+      "completions/max_terminated_length": 718.0,
+      "completions/mean_length": 416.3571472167969,
+      "completions/mean_terminated_length": 416.3571472167969,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 2.086664947124065,
+      "grad_norm": 0.8954543471336365,
+      "kl": 0.12060546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0046,
+      "num_tokens": 236653214.0,
+      "reward": 1.5348213911056519,
+      "reward_std": 0.2690960764884949,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5348214507102966,
+      "rewards/curriculum_aware_reward_fn/std": 0.4390222132205963,
+      "step": 2022
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1526.0,
+      "completions/max_terminated_length": 1526.0,
+      "completions/mean_length": 454.2589416503906,
+      "completions/mean_terminated_length": 454.2589416503906,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 2.0876966726850656,
+      "grad_norm": 0.7094557881355286,
+      "kl": 0.1168212890625,
+      "learning_rate": 1e-06,
+      "loss": 0.008,
+      "num_tokens": 236776177.0,
+      "reward": 1.5254465341567993,
+      "reward_std": 0.1793346107006073,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5254464149475098,
+      "rewards/curriculum_aware_reward_fn/std": 0.4094049632549286,
+      "step": 2023
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 883.0,
+      "completions/max_terminated_length": 883.0,
+      "completions/mean_length": 429.1607360839844,
+      "completions/mean_terminated_length": 429.1607360839844,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 2.0887283982460665,
+      "grad_norm": 0.740519642829895,
+      "kl": 0.1337890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0209,
+      "num_tokens": 236897029.0,
+      "reward": 1.4156250953674316,
+      "reward_std": 0.18167220056056976,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4156250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.4546571373939514,
+      "step": 2024
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 702.0,
+      "completions/max_terminated_length": 702.0,
+      "completions/mean_length": 409.5535888671875,
+      "completions/mean_terminated_length": 409.5535888671875,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 2.0897601238070673,
+      "grad_norm": 0.6365262269973755,
+      "kl": 0.1259765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0217,
+      "num_tokens": 237009742.0,
+      "reward": 1.567857265472412,
+      "reward_std": 0.15161246061325073,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5678571462631226,
+      "rewards/curriculum_aware_reward_fn/std": 0.4445291757583618,
+      "step": 2025
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 796.0,
+      "completions/mean_length": 427.3750305175781,
+      "completions/mean_terminated_length": 394.3243408203125,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 2.090791849368068,
+      "grad_norm": 0.6970322728157043,
+      "kl": 0.1146240234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0093,
+      "num_tokens": 237133222.0,
+      "reward": 1.6142858266830444,
+      "reward_std": 0.21715456247329712,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6142857670783997,
+      "rewards/curriculum_aware_reward_fn/std": 0.41754475235939026,
+      "step": 2026
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 957.0,
+      "completions/max_terminated_length": 957.0,
+      "completions/mean_length": 464.8750305175781,
+      "completions/mean_terminated_length": 464.8750305175781,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 2.091823574929069,
+      "grad_norm": 0.807088315486908,
+      "kl": 0.113037109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0205,
+      "num_tokens": 237248708.0,
+      "reward": 1.5125001668930054,
+      "reward_std": 0.19637462496757507,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.512499988079071,
+      "rewards/curriculum_aware_reward_fn/std": 0.4132108986377716,
+      "step": 2027
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1139.0,
+      "completions/max_terminated_length": 1139.0,
+      "completions/mean_length": 436.0535888671875,
+      "completions/mean_terminated_length": 436.0535888671875,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 2.09285530049007,
+      "grad_norm": 0.6729896068572998,
+      "kl": 0.120849609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0026,
+      "num_tokens": 237369429.0,
+      "reward": 1.4919644594192505,
+      "reward_std": 0.17199517786502838,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5008928775787354,
+      "rewards/curriculum_aware_reward_fn/std": 0.42304036021232605,
+      "step": 2028
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 938.0,
+      "completions/max_terminated_length": 938.0,
+      "completions/mean_length": 395.83038330078125,
+      "completions/mean_terminated_length": 395.83038330078125,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 2.0938870260510702,
+      "grad_norm": 0.7247878909111023,
+      "kl": 0.12548828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0083,
+      "num_tokens": 237475054.0,
+      "reward": 1.5147322416305542,
+      "reward_std": 0.18124648928642273,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5236607193946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.444939523935318,
+      "step": 2029
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1456.0,
+      "completions/max_terminated_length": 1456.0,
+      "completions/mean_length": 464.0000305175781,
+      "completions/mean_terminated_length": 464.0000305175781,
+      "completions/min_length": 147.0,
+      "completions/min_terminated_length": 147.0,
+      "epoch": 2.094918751612071,
+      "grad_norm": 0.7838840484619141,
+      "kl": 0.1431884765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0104,
+      "num_tokens": 237598926.0,
+      "reward": 1.471428632736206,
+      "reward_std": 0.2576696574687958,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48035717010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.41931721568107605,
+      "step": 2030
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 701.0,
+      "completions/max_terminated_length": 701.0,
+      "completions/mean_length": 399.26788330078125,
+      "completions/mean_terminated_length": 399.26788330078125,
+      "completions/min_length": 158.0,
+      "completions/min_terminated_length": 158.0,
+      "epoch": 2.095950477173072,
+      "grad_norm": 0.8283127546310425,
+      "kl": 0.12158203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0458,
+      "num_tokens": 237712524.0,
+      "reward": 1.4522322416305542,
+      "reward_std": 0.22037231922149658,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45223215222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.444606751203537,
+      "step": 2031
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2448.0,
+      "completions/max_terminated_length": 2448.0,
+      "completions/mean_length": 399.4821472167969,
+      "completions/mean_terminated_length": 399.4821472167969,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 2.0969822027340728,
+      "grad_norm": 0.612908124923706,
+      "kl": 0.1302490234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0107,
+      "num_tokens": 237829779.0,
+      "reward": 1.6995537281036377,
+      "reward_std": 0.09788351505994797,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6995535492897034,
+      "rewards/curriculum_aware_reward_fn/std": 0.3540305495262146,
+      "step": 2032
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1341.0,
+      "completions/max_terminated_length": 1341.0,
+      "completions/mean_length": 444.6875305175781,
+      "completions/mean_terminated_length": 444.6875305175781,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 2.0980139282950736,
+      "grad_norm": 0.6735214591026306,
+      "kl": 0.1121826171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0063,
+      "num_tokens": 237948653.0,
+      "reward": 1.4866071939468384,
+      "reward_std": 0.1361234039068222,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4866071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.3955257534980774,
+      "step": 2033
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 843.0,
+      "completions/mean_length": 419.6160888671875,
+      "completions/mean_terminated_length": 386.4955139160156,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 2.0990456538560744,
+      "grad_norm": 0.6528956294059753,
+      "kl": 0.118896484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0572,
+      "num_tokens": 238063018.0,
+      "reward": 1.5607143640518188,
+      "reward_std": 0.12395340204238892,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5696428418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.4645717144012451,
+      "step": 2034
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1086.0,
+      "completions/max_terminated_length": 1086.0,
+      "completions/mean_length": 473.83038330078125,
+      "completions/mean_terminated_length": 473.83038330078125,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 2.1000773794170753,
+      "grad_norm": 0.7285524606704712,
+      "kl": 0.1185302734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0201,
+      "num_tokens": 238190441.0,
+      "reward": 1.4928573369979858,
+      "reward_std": 0.1570131480693817,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5017856955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.4277496337890625,
+      "step": 2035
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1259.0,
+      "completions/max_terminated_length": 1259.0,
+      "completions/mean_length": 423.5982360839844,
+      "completions/mean_terminated_length": 423.5982360839844,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 2.1011091049780757,
+      "grad_norm": 0.9682310819625854,
+      "kl": 0.1566162109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0046,
+      "num_tokens": 238303852.0,
+      "reward": 1.406250238418579,
+      "reward_std": 0.1819208562374115,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40625,
+      "rewards/curriculum_aware_reward_fn/std": 0.3843284249305725,
+      "step": 2036
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1290.0,
+      "completions/max_terminated_length": 1290.0,
+      "completions/mean_length": 475.8660888671875,
+      "completions/mean_terminated_length": 475.8660888671875,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 2.1021408305390765,
+      "grad_norm": 0.7141610980033875,
+      "kl": 0.1177978515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0045,
+      "num_tokens": 238430978.0,
+      "reward": 1.3500001430511475,
+      "reward_std": 0.1426495909690857,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3499999940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.3537444472312927,
+      "step": 2037
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 637.0,
+      "completions/max_terminated_length": 637.0,
+      "completions/mean_length": 372.9375305175781,
+      "completions/mean_terminated_length": 372.9375305175781,
+      "completions/min_length": 82.0,
+      "completions/min_terminated_length": 82.0,
+      "epoch": 2.1031725561000774,
+      "grad_norm": 0.9728583693504333,
+      "kl": 0.14453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0047,
+      "num_tokens": 238533472.0,
+      "reward": 1.427232265472412,
+      "reward_std": 0.24139705300331116,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44508928060531616,
+      "rewards/curriculum_aware_reward_fn/std": 0.4519706070423126,
+      "step": 2038
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 810.0,
+      "completions/max_terminated_length": 810.0,
+      "completions/mean_length": 426.01788330078125,
+      "completions/mean_terminated_length": 426.01788330078125,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 2.104204281661078,
+      "grad_norm": 0.7383515238761902,
+      "kl": 0.1279296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0031,
+      "num_tokens": 238658921.0,
+      "reward": 1.6232143640518188,
+      "reward_std": 0.18711508810520172,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6232143044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.383260577917099,
+      "step": 2039
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 790.0,
+      "completions/max_terminated_length": 790.0,
+      "completions/mean_length": 418.7500305175781,
+      "completions/mean_terminated_length": 418.7500305175781,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 2.105236007222079,
+      "grad_norm": 0.5161622166633606,
+      "kl": 0.122314453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0094,
+      "num_tokens": 238778348.0,
+      "reward": 1.4593751430511475,
+      "reward_std": 0.08913283795118332,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4593749940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.4484975337982178,
+      "step": 2040
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1151.0,
+      "completions/max_terminated_length": 1151.0,
+      "completions/mean_length": 435.6250305175781,
+      "completions/mean_terminated_length": 435.6250305175781,
+      "completions/min_length": 138.0,
+      "completions/min_terminated_length": 138.0,
+      "epoch": 2.10626773278308,
+      "grad_norm": 0.7753861546516418,
+      "kl": 0.11962890625,
+      "learning_rate": 1e-06,
+      "loss": -0.001,
+      "num_tokens": 238894898.0,
+      "reward": 1.5455358028411865,
+      "reward_std": 0.2276323437690735,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.545535683631897,
+      "rewards/curriculum_aware_reward_fn/std": 0.4044545590877533,
+      "step": 2041
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 826.0,
+      "completions/max_terminated_length": 826.0,
+      "completions/mean_length": 399.9821472167969,
+      "completions/mean_terminated_length": 399.9821472167969,
+      "completions/min_length": 161.0,
+      "completions/min_terminated_length": 161.0,
+      "epoch": 2.1072994583440803,
+      "grad_norm": 0.804185152053833,
+      "kl": 0.1265869140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0039,
+      "num_tokens": 239016682.0,
+      "reward": 1.6325894594192505,
+      "reward_std": 0.16231726109981537,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6325892806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.5415073037147522,
+      "step": 2042
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 783.0,
+      "completions/max_terminated_length": 783.0,
+      "completions/mean_length": 421.89288330078125,
+      "completions/mean_terminated_length": 421.89288330078125,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 2.108331183905081,
+      "grad_norm": 0.7326617240905762,
+      "kl": 0.12353515625,
+      "learning_rate": 1e-06,
+      "loss": -0.003,
+      "num_tokens": 239130959.0,
+      "reward": 1.5669643878936768,
+      "reward_std": 0.2607947587966919,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5669642686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.41397616267204285,
+      "step": 2043
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1036.0,
+      "completions/max_terminated_length": 1036.0,
+      "completions/mean_length": 444.08929443359375,
+      "completions/mean_terminated_length": 444.08929443359375,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 2.109362909466082,
+      "grad_norm": 0.737518310546875,
+      "kl": 0.1077880859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0041,
+      "num_tokens": 239252202.0,
+      "reward": 1.4821429252624512,
+      "reward_std": 0.20218975841999054,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4821428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.46172693371772766,
+      "step": 2044
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1561.0,
+      "completions/max_terminated_length": 1561.0,
+      "completions/mean_length": 468.4375305175781,
+      "completions/mean_terminated_length": 468.4375305175781,
+      "completions/min_length": 145.0,
+      "completions/min_terminated_length": 145.0,
+      "epoch": 2.110394635027083,
+      "grad_norm": 0.7885037064552307,
+      "kl": 0.1168212890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0091,
+      "num_tokens": 239374653.0,
+      "reward": 1.3750001192092896,
+      "reward_std": 0.17816029489040375,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.375,
+      "rewards/curriculum_aware_reward_fn/std": 0.380788654088974,
+      "step": 2045
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1079.0,
+      "completions/max_terminated_length": 1079.0,
+      "completions/mean_length": 401.33929443359375,
+      "completions/mean_terminated_length": 401.33929443359375,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 2.1114263605880836,
+      "grad_norm": 0.8109065890312195,
+      "kl": 0.134033203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0014,
+      "num_tokens": 239493635.0,
+      "reward": 1.4455360174179077,
+      "reward_std": 0.23022522032260895,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44553571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3891867995262146,
+      "step": 2046
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1016.0,
+      "completions/max_terminated_length": 1016.0,
+      "completions/mean_length": 445.1964416503906,
+      "completions/mean_terminated_length": 445.1964416503906,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 2.1124580861490845,
+      "grad_norm": 0.7626376152038574,
+      "kl": 0.131103515625,
+      "learning_rate": 1e-06,
+      "loss": -0.025,
+      "num_tokens": 239622688.0,
+      "reward": 1.3906251192092896,
+      "reward_std": 0.18730053305625916,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.390625,
+      "rewards/curriculum_aware_reward_fn/std": 0.41970235109329224,
+      "step": 2047
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1603.0,
+      "completions/max_terminated_length": 1603.0,
+      "completions/mean_length": 445.26788330078125,
+      "completions/mean_terminated_length": 445.26788330078125,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 2.1134898117100853,
+      "grad_norm": 0.7581174373626709,
+      "kl": 0.1285400390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0023,
+      "num_tokens": 239731995.0,
+      "reward": 1.6513394117355347,
+      "reward_std": 0.24180802702903748,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6691964268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.39107412099838257,
+      "step": 2048
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 814.0,
+      "completions/max_terminated_length": 814.0,
+      "completions/mean_length": 421.0625305175781,
+      "completions/mean_terminated_length": 421.0625305175781,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 2.1145215372710857,
+      "grad_norm": 0.8369876742362976,
+      "kl": 0.108642578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0334,
+      "num_tokens": 239841048.0,
+      "reward": 1.5040180683135986,
+      "reward_std": 0.18611615896224976,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5040178894996643,
+      "rewards/curriculum_aware_reward_fn/std": 0.42315515875816345,
+      "step": 2049
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1976.0,
+      "completions/max_terminated_length": 1976.0,
+      "completions/mean_length": 483.107177734375,
+      "completions/mean_terminated_length": 483.107177734375,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "epoch": 2.1155532628320866,
+      "grad_norm": 0.6806438565254211,
+      "kl": 0.10986328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0284,
+      "num_tokens": 239965016.0,
+      "reward": 1.3593751192092896,
+      "reward_std": 0.1829347163438797,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.359375,
+      "rewards/curriculum_aware_reward_fn/std": 0.38082006573677063,
+      "step": 2050
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1345.0,
+      "completions/max_terminated_length": 1345.0,
+      "completions/mean_length": 433.40179443359375,
+      "completions/mean_terminated_length": 433.40179443359375,
+      "completions/min_length": 154.0,
+      "completions/min_terminated_length": 154.0,
+      "epoch": 2.1165849883930874,
+      "grad_norm": 0.7132896780967712,
+      "kl": 0.12939453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0037,
+      "num_tokens": 240086000.0,
+      "reward": 1.2906250953674316,
+      "reward_std": 0.14403748512268066,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.37473902106285095,
+      "step": 2051
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1810.0,
+      "completions/max_terminated_length": 1810.0,
+      "completions/mean_length": 426.9732360839844,
+      "completions/mean_terminated_length": 426.9732360839844,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 2.1176167139540882,
+      "grad_norm": 0.8702093958854675,
+      "kl": 0.135009765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0112,
+      "num_tokens": 240196555.0,
+      "reward": 1.5214285850524902,
+      "reward_std": 0.2170983999967575,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5214285850524902,
+      "rewards/curriculum_aware_reward_fn/std": 0.41572222113609314,
+      "step": 2052
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3992.0,
+      "completions/max_terminated_length": 3992.0,
+      "completions/mean_length": 469.9107360839844,
+      "completions/mean_terminated_length": 469.9107360839844,
+      "completions/min_length": 112.0,
+      "completions/min_terminated_length": 112.0,
+      "epoch": 2.118648439515089,
+      "grad_norm": 0.5609109997749329,
+      "kl": 0.104248046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0232,
+      "num_tokens": 240318044.0,
+      "reward": 1.4709821939468384,
+      "reward_std": 0.15642313659191132,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4709821343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.43281179666519165,
+      "step": 2053
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 916.0,
+      "completions/max_terminated_length": 916.0,
+      "completions/mean_length": 433.96429443359375,
+      "completions/mean_terminated_length": 433.96429443359375,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "epoch": 2.11968016507609,
+      "grad_norm": 0.693916916847229,
+      "kl": 0.1099853515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0154,
+      "num_tokens": 240436529.0,
+      "reward": 1.395535945892334,
+      "reward_std": 0.15839314460754395,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3955357074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.40372997522354126,
+      "step": 2054
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3397.0,
+      "completions/max_terminated_length": 3397.0,
+      "completions/mean_length": 470.0357360839844,
+      "completions/mean_terminated_length": 470.0357360839844,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 2.1207118906370903,
+      "grad_norm": 0.8244123458862305,
+      "kl": 0.1292724609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0302,
+      "num_tokens": 240553846.0,
+      "reward": 1.442857265472412,
+      "reward_std": 0.24676746129989624,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44285711646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.4205774962902069,
+      "step": 2055
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 841.0,
+      "completions/max_terminated_length": 841.0,
+      "completions/mean_length": 408.08038330078125,
+      "completions/mean_terminated_length": 408.08038330078125,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "epoch": 2.121743616198091,
+      "grad_norm": 0.5505373477935791,
+      "kl": 0.1217041015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0145,
+      "num_tokens": 240662348.0,
+      "reward": 1.4209821224212646,
+      "reward_std": 0.1188260167837143,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42991071939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.48632004857063293,
+      "step": 2056
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 894.0,
+      "completions/max_terminated_length": 894.0,
+      "completions/mean_length": 449.107177734375,
+      "completions/mean_terminated_length": 449.107177734375,
+      "completions/min_length": 137.0,
+      "completions/min_terminated_length": 137.0,
+      "epoch": 2.122775341759092,
+      "grad_norm": 0.638522744178772,
+      "kl": 0.1121826171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0101,
+      "num_tokens": 240784188.0,
+      "reward": 1.4647324085235596,
+      "reward_std": 0.15416447818279266,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46473217010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.4712279438972473,
+      "step": 2057
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1054.0,
+      "completions/max_terminated_length": 1054.0,
+      "completions/mean_length": 441.0089416503906,
+      "completions/mean_terminated_length": 441.0089416503906,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 2.123807067320093,
+      "grad_norm": 0.6578857898712158,
+      "kl": 0.1224365234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0172,
+      "num_tokens": 240898730.0,
+      "reward": 1.524553656578064,
+      "reward_std": 0.16847702860832214,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5245535969734192,
+      "rewards/curriculum_aware_reward_fn/std": 0.4525202214717865,
+      "step": 2058
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1002.0,
+      "completions/mean_length": 478.2589416503906,
+      "completions/mean_terminated_length": 445.66668701171875,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 2.1248387928810937,
+      "grad_norm": 0.8479138016700745,
+      "kl": 0.122802734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0364,
+      "num_tokens": 241023436.0,
+      "reward": 1.4089287519454956,
+      "reward_std": 0.23351380228996277,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41785717010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.4163254499435425,
+      "step": 2059
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 862.0,
+      "completions/max_terminated_length": 862.0,
+      "completions/mean_length": 447.794677734375,
+      "completions/mean_terminated_length": 447.794677734375,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 2.1258705184420945,
+      "grad_norm": 0.833861231803894,
+      "kl": 0.1229248046875,
+      "learning_rate": 1e-06,
+      "loss": -0.011,
+      "num_tokens": 241152024.0,
+      "reward": 1.502678632736206,
+      "reward_std": 0.2115645557641983,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5026785135269165,
+      "rewards/curriculum_aware_reward_fn/std": 0.42414936423301697,
+      "step": 2060
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2355.0,
+      "completions/max_terminated_length": 2355.0,
+      "completions/mean_length": 469.5982360839844,
+      "completions/mean_terminated_length": 469.5982360839844,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 2.1269022440030954,
+      "grad_norm": 0.8299174308776855,
+      "kl": 0.117919921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0008,
+      "num_tokens": 241280527.0,
+      "reward": 1.3687500953674316,
+      "reward_std": 0.2414630949497223,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36875003576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.40576502680778503,
+      "step": 2061
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 928.0,
+      "completions/max_terminated_length": 928.0,
+      "completions/mean_length": 439.45538330078125,
+      "completions/mean_terminated_length": 439.45538330078125,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 2.1279339695640958,
+      "grad_norm": 0.8200660347938538,
+      "kl": 0.1226806640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0451,
+      "num_tokens": 241400677.0,
+      "reward": 1.559821605682373,
+      "reward_std": 0.18447676301002502,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5687499642372131,
+      "rewards/curriculum_aware_reward_fn/std": 0.44337591528892517,
+      "step": 2062
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1236.0,
+      "completions/max_terminated_length": 1236.0,
+      "completions/mean_length": 442.3482360839844,
+      "completions/mean_terminated_length": 442.3482360839844,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 2.1289656951250966,
+      "grad_norm": 0.7119675874710083,
+      "kl": 0.1207275390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0318,
+      "num_tokens": 241512618.0,
+      "reward": 1.4754464626312256,
+      "reward_std": 0.16743336617946625,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4754464328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.4168195426464081,
+      "step": 2063
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3617.0,
+      "completions/max_terminated_length": 3617.0,
+      "completions/mean_length": 447.7232360839844,
+      "completions/mean_terminated_length": 447.7232360839844,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "epoch": 2.1299974206860974,
+      "grad_norm": 0.7517739534378052,
+      "kl": 0.1259765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0033,
+      "num_tokens": 241629704.0,
+      "reward": 1.4156250953674316,
+      "reward_std": 0.2064957469701767,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41562503576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.4368721544742584,
+      "step": 2064
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 848.0,
+      "completions/max_terminated_length": 848.0,
+      "completions/mean_length": 428.2232360839844,
+      "completions/mean_terminated_length": 428.2232360839844,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 2.1310291462470983,
+      "grad_norm": 0.8079065084457397,
+      "kl": 0.1055908203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0083,
+      "num_tokens": 241745803.0,
+      "reward": 1.557142972946167,
+      "reward_std": 0.23306429386138916,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5571428537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.4445291757583618,
+      "step": 2065
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1499.0,
+      "completions/max_terminated_length": 1499.0,
+      "completions/mean_length": 421.27679443359375,
+      "completions/mean_terminated_length": 421.27679443359375,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 2.132060871808099,
+      "grad_norm": 0.6699889898300171,
+      "kl": 0.110595703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0165,
+      "num_tokens": 241848482.0,
+      "reward": 1.5254465341567993,
+      "reward_std": 0.1498248130083084,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5254464149475098,
+      "rewards/curriculum_aware_reward_fn/std": 0.4337140917778015,
+      "step": 2066
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 769.0,
+      "completions/max_terminated_length": 769.0,
+      "completions/mean_length": 399.4910888671875,
+      "completions/mean_terminated_length": 399.4910888671875,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 2.1330925973691,
+      "grad_norm": 0.8710476756095886,
+      "kl": 0.1302490234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0048,
+      "num_tokens": 241960292.0,
+      "reward": 1.5870537757873535,
+      "reward_std": 0.24912148714065552,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5870535969734192,
+      "rewards/curriculum_aware_reward_fn/std": 0.40077322721481323,
+      "step": 2067
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1508.0,
+      "completions/max_terminated_length": 1508.0,
+      "completions/mean_length": 426.8839416503906,
+      "completions/mean_terminated_length": 426.8839416503906,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 2.134124322930101,
+      "grad_norm": 0.8446362018585205,
+      "kl": 0.1260986328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0197,
+      "num_tokens": 242076640.0,
+      "reward": 1.3781250715255737,
+      "reward_std": 0.23388829827308655,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.409062922000885,
+      "step": 2068
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1901.0,
+      "completions/max_terminated_length": 1901.0,
+      "completions/mean_length": 467.107177734375,
+      "completions/mean_terminated_length": 467.107177734375,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 2.135156048491101,
+      "grad_norm": 0.7577447891235352,
+      "kl": 0.1160888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0406,
+      "num_tokens": 242198655.0,
+      "reward": 1.422767996788025,
+      "reward_std": 0.17161372303962708,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43169641494750977,
+      "rewards/curriculum_aware_reward_fn/std": 0.4034197926521301,
+      "step": 2069
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 908.0,
+      "completions/max_terminated_length": 908.0,
+      "completions/mean_length": 400.39288330078125,
+      "completions/mean_terminated_length": 400.39288330078125,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 2.136187774052102,
+      "grad_norm": 0.7759943008422852,
+      "kl": 0.122314453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0061,
+      "num_tokens": 242318343.0,
+      "reward": 1.6142858266830444,
+      "reward_std": 0.18312881886959076,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6142857670783997,
+      "rewards/curriculum_aware_reward_fn/std": 0.43409988284111023,
+      "step": 2070
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1280.0,
+      "completions/max_terminated_length": 1280.0,
+      "completions/mean_length": 456.2857360839844,
+      "completions/mean_terminated_length": 456.2857360839844,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 2.137219499613103,
+      "grad_norm": 0.6259797811508179,
+      "kl": 0.1077880859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0289,
+      "num_tokens": 242438831.0,
+      "reward": 1.5111607313156128,
+      "reward_std": 0.22564736008644104,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5111607313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.4776872992515564,
+      "step": 2071
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 976.0,
+      "completions/max_terminated_length": 976.0,
+      "completions/mean_length": 411.9375305175781,
+      "completions/mean_terminated_length": 411.9375305175781,
+      "completions/min_length": 162.0,
+      "completions/min_terminated_length": 162.0,
+      "epoch": 2.1382512251741037,
+      "grad_norm": 0.7962878942489624,
+      "kl": 0.1103515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0324,
+      "num_tokens": 242547520.0,
+      "reward": 1.6008931398391724,
+      "reward_std": 0.29236966371536255,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6098214387893677,
+      "rewards/curriculum_aware_reward_fn/std": 0.4369138181209564,
+      "step": 2072
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1229.0,
+      "completions/max_terminated_length": 1229.0,
+      "completions/mean_length": 428.51788330078125,
+      "completions/mean_terminated_length": 428.51788330078125,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 2.1392829507351045,
+      "grad_norm": 0.7675960063934326,
+      "kl": 0.126220703125,
+      "learning_rate": 1e-06,
+      "loss": 0.005,
+      "num_tokens": 242670449.0,
+      "reward": 1.4281251430511475,
+      "reward_std": 0.2562326490879059,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42812496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.44386619329452515,
+      "step": 2073
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1037.0,
+      "completions/max_terminated_length": 1037.0,
+      "completions/mean_length": 410.9910888671875,
+      "completions/mean_terminated_length": 410.9910888671875,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 2.1403146762961054,
+      "grad_norm": 0.8732093572616577,
+      "kl": 0.1181640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0072,
+      "num_tokens": 242791037.0,
+      "reward": 1.5017858743667603,
+      "reward_std": 0.27640336751937866,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5017856955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.4054209887981415,
+      "step": 2074
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1113.0,
+      "completions/max_terminated_length": 1113.0,
+      "completions/mean_length": 438.7232360839844,
+      "completions/mean_terminated_length": 438.7232360839844,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 2.1413464018571062,
+      "grad_norm": 0.7985749244689941,
+      "kl": 0.133056640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0117,
+      "num_tokens": 242901368.0,
+      "reward": 1.5656250715255737,
+      "reward_std": 0.1947113573551178,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.565625011920929,
+      "rewards/curriculum_aware_reward_fn/std": 0.41557687520980835,
+      "step": 2075
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1259.0,
+      "completions/max_terminated_length": 1259.0,
+      "completions/mean_length": 455.5625305175781,
+      "completions/mean_terminated_length": 455.5625305175781,
+      "completions/min_length": 153.0,
+      "completions/min_terminated_length": 153.0,
+      "epoch": 2.1423781274181066,
+      "grad_norm": 0.7833972573280334,
+      "kl": 0.1212158203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0117,
+      "num_tokens": 243015321.0,
+      "reward": 1.4928572177886963,
+      "reward_std": 0.22096051275730133,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4928571283817291,
+      "rewards/curriculum_aware_reward_fn/std": 0.3814893066883087,
+      "step": 2076
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1843.0,
+      "completions/max_terminated_length": 1843.0,
+      "completions/mean_length": 443.20538330078125,
+      "completions/mean_terminated_length": 443.20538330078125,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 2.1434098529791075,
+      "grad_norm": 0.8090054392814636,
+      "kl": 0.1243896484375,
+      "learning_rate": 1e-06,
+      "loss": 0.016,
+      "num_tokens": 243135618.0,
+      "reward": 1.3790180683135986,
+      "reward_std": 0.19926463067531586,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3790178596973419,
+      "rewards/curriculum_aware_reward_fn/std": 0.3928779363632202,
+      "step": 2077
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3311.0,
+      "completions/max_terminated_length": 3311.0,
+      "completions/mean_length": 471.6339416503906,
+      "completions/mean_terminated_length": 471.6339416503906,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "epoch": 2.1444415785401083,
+      "grad_norm": 0.9836584329605103,
+      "kl": 0.15234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0335,
+      "num_tokens": 243262704.0,
+      "reward": 1.5544644594192505,
+      "reward_std": 0.2018306702375412,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5544642806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.3974328339099884,
+      "step": 2078
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1575.0,
+      "completions/max_terminated_length": 1575.0,
+      "completions/mean_length": 479.8660888671875,
+      "completions/mean_terminated_length": 479.8660888671875,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 2.145473304101109,
+      "grad_norm": 0.7660709023475647,
+      "kl": 0.1162109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0069,
+      "num_tokens": 243380555.0,
+      "reward": 1.4000000953674316,
+      "reward_std": 0.2146679162979126,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4000000059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.4533161222934723,
+      "step": 2079
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1415.0,
+      "completions/max_terminated_length": 1415.0,
+      "completions/mean_length": 443.96429443359375,
+      "completions/mean_terminated_length": 443.96429443359375,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 2.14650502966211,
+      "grad_norm": 0.7633886337280273,
+      "kl": 0.1260986328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0189,
+      "num_tokens": 243496030.0,
+      "reward": 1.364285945892334,
+      "reward_std": 0.1707383245229721,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3642857074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.3768903613090515,
+      "step": 2080
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1422.0,
+      "completions/max_terminated_length": 1422.0,
+      "completions/mean_length": 437.4196472167969,
+      "completions/mean_terminated_length": 437.4196472167969,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 2.147536755223111,
+      "grad_norm": 0.5623828172683716,
+      "kl": 0.134521484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0066,
+      "num_tokens": 243611199.0,
+      "reward": 1.46473228931427,
+      "reward_std": 0.17655536532402039,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4825893044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.46936365962028503,
+      "step": 2081
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3209.0,
+      "completions/max_terminated_length": 3209.0,
+      "completions/mean_length": 473.1607360839844,
+      "completions/mean_terminated_length": 473.1607360839844,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 2.1485684807841112,
+      "grad_norm": 0.5925241112709045,
+      "kl": 0.12158203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0526,
+      "num_tokens": 243732247.0,
+      "reward": 1.4468752145767212,
+      "reward_std": 0.1427421271800995,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45580360293388367,
+      "rewards/curriculum_aware_reward_fn/std": 0.40955621004104614,
+      "step": 2082
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2060.0,
+      "completions/max_terminated_length": 2060.0,
+      "completions/mean_length": 524.8125,
+      "completions/mean_terminated_length": 524.8125,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 2.149600206345112,
+      "grad_norm": 0.5460736751556396,
+      "kl": 0.120849609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0012,
+      "num_tokens": 243856651.0,
+      "reward": 1.4629465341567993,
+      "reward_std": 0.17079660296440125,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47187498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.45543670654296875,
+      "step": 2083
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1341.0,
+      "completions/max_terminated_length": 1341.0,
+      "completions/mean_length": 457.45538330078125,
+      "completions/mean_terminated_length": 457.45538330078125,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 2.150631931906113,
+      "grad_norm": 0.8701239824295044,
+      "kl": 0.134765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0452,
+      "num_tokens": 243983976.0,
+      "reward": 1.4397321939468384,
+      "reward_std": 0.20743878185749054,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4397321343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.39158594608306885,
+      "step": 2084
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1926.0,
+      "completions/max_terminated_length": 1926.0,
+      "completions/mean_length": 525.6607666015625,
+      "completions/mean_terminated_length": 525.6607666015625,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 2.1516636574671137,
+      "grad_norm": 0.616726815700531,
+      "kl": 0.1109619140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0058,
+      "num_tokens": 244113900.0,
+      "reward": 1.4772322177886963,
+      "reward_std": 0.14241279661655426,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4772321581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.4339161813259125,
+      "step": 2085
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1227.0,
+      "completions/max_terminated_length": 1227.0,
+      "completions/mean_length": 480.58038330078125,
+      "completions/mean_terminated_length": 480.58038330078125,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 2.1526953830281146,
+      "grad_norm": 0.561264157295227,
+      "kl": 0.1026611328125,
+      "learning_rate": 1e-06,
+      "loss": 0.021,
+      "num_tokens": 244247959.0,
+      "reward": 1.6464285850524902,
+      "reward_std": 0.11487598717212677,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6464285254478455,
+      "rewards/curriculum_aware_reward_fn/std": 0.43010130524635315,
+      "step": 2086
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1466.0,
+      "completions/mean_length": 542.1160888671875,
+      "completions/mean_terminated_length": 477.5,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "epoch": 2.1537271085891154,
+      "grad_norm": 0.6171847581863403,
+      "kl": 0.115966796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0731,
+      "num_tokens": 244379593.0,
+      "reward": 1.3107144832611084,
+      "reward_std": 0.21698839962482452,
+      "rewards/code_format_reward/mean": 0.9642857313156128,
+      "rewards/code_format_reward/std": 0.18641091883182526,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34642860293388367,
+      "rewards/curriculum_aware_reward_fn/std": 0.41196706891059875,
+      "step": 2087
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1726.0,
+      "completions/max_terminated_length": 1726.0,
+      "completions/mean_length": 444.169677734375,
+      "completions/mean_terminated_length": 444.169677734375,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 2.1547588341501163,
+      "grad_norm": 0.7611954808235168,
+      "kl": 0.1361083984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0192,
+      "num_tokens": 244496302.0,
+      "reward": 1.6562501192092896,
+      "reward_std": 0.2018815129995346,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.65625,
+      "rewards/curriculum_aware_reward_fn/std": 0.5501484274864197,
+      "step": 2088
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1449.0,
+      "completions/max_terminated_length": 1449.0,
+      "completions/mean_length": 443.1250305175781,
+      "completions/mean_terminated_length": 443.1250305175781,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 2.1557905597111167,
+      "grad_norm": 0.6869838833808899,
+      "kl": 0.131103515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0039,
+      "num_tokens": 244612894.0,
+      "reward": 1.5941966772079468,
+      "reward_std": 0.1067938357591629,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5941963791847229,
+      "rewards/curriculum_aware_reward_fn/std": 0.41125744581222534,
+      "step": 2089
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 983.0,
+      "completions/mean_length": 521.2857666015625,
+      "completions/mean_terminated_length": 456.2908935546875,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 2.1568222852721175,
+      "grad_norm": 0.643674910068512,
+      "kl": 0.1141357421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0695,
+      "num_tokens": 244733998.0,
+      "reward": 1.3696428537368774,
+      "reward_std": 0.24212218821048737,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38750001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.42651426792144775,
+      "step": 2090
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1299.0,
+      "completions/mean_length": 517.8035888671875,
+      "completions/mean_terminated_length": 485.56756591796875,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 2.1578540108331183,
+      "grad_norm": 0.6695377230644226,
+      "kl": 0.123046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0219,
+      "num_tokens": 244863798.0,
+      "reward": 1.412500023841858,
+      "reward_std": 0.2101779282093048,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42142853140830994,
+      "rewards/curriculum_aware_reward_fn/std": 0.446140319108963,
+      "step": 2091
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1477.0,
+      "completions/max_terminated_length": 1477.0,
+      "completions/mean_length": 503.0535888671875,
+      "completions/mean_terminated_length": 503.0535888671875,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 2.158885736394119,
+      "grad_norm": 0.7329318523406982,
+      "kl": 0.1243896484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0035,
+      "num_tokens": 244984792.0,
+      "reward": 1.4683037996292114,
+      "reward_std": 0.25523483753204346,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4772321581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.4468553066253662,
+      "step": 2092
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1368.0,
+      "completions/mean_length": 537.0625,
+      "completions/mean_terminated_length": 505.0,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "epoch": 2.15991746195512,
+      "grad_norm": 0.8445482850074768,
+      "kl": 0.137939453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0022,
+      "num_tokens": 245118282.0,
+      "reward": 1.4089287519454956,
+      "reward_std": 0.2673889100551605,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41785717010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.38630014657974243,
+      "step": 2093
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 853.0,
+      "completions/max_terminated_length": 853.0,
+      "completions/mean_length": 474.1785888671875,
+      "completions/mean_terminated_length": 474.1785888671875,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 2.160949187516121,
+      "grad_norm": 0.693393349647522,
+      "kl": 0.1282958984375,
+      "learning_rate": 1e-06,
+      "loss": 0.029,
+      "num_tokens": 245231415.0,
+      "reward": 1.4066966772079468,
+      "reward_std": 0.15930292010307312,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4066964089870453,
+      "rewards/curriculum_aware_reward_fn/std": 0.4089370667934418,
+      "step": 2094
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1649.0,
+      "completions/max_terminated_length": 1649.0,
+      "completions/mean_length": 474.95538330078125,
+      "completions/mean_terminated_length": 474.95538330078125,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 2.1619809130771213,
+      "grad_norm": 0.7273092865943909,
+      "kl": 0.113037109375,
+      "learning_rate": 1e-06,
+      "loss": 0.021,
+      "num_tokens": 245348590.0,
+      "reward": 1.5553573369979858,
+      "reward_std": 0.25116923451423645,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5553571581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.4208719730377197,
+      "step": 2095
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1380.0,
+      "completions/max_terminated_length": 1380.0,
+      "completions/mean_length": 487.0357360839844,
+      "completions/mean_terminated_length": 487.0357360839844,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 2.163012638638122,
+      "grad_norm": 0.6591194868087769,
+      "kl": 0.1126708984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0117,
+      "num_tokens": 245483337.0,
+      "reward": 1.5866073369979858,
+      "reward_std": 0.13034969568252563,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5866071581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.44255515933036804,
+      "step": 2096
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1023.0,
+      "completions/max_terminated_length": 1023.0,
+      "completions/mean_length": 494.1964416503906,
+      "completions/mean_terminated_length": 494.1964416503906,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 2.164044364199123,
+      "grad_norm": 0.6993364691734314,
+      "kl": 0.12255859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0155,
+      "num_tokens": 245601939.0,
+      "reward": 1.4361608028411865,
+      "reward_std": 0.23765063285827637,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43616071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.41817978024482727,
+      "step": 2097
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 957.0,
+      "completions/max_terminated_length": 957.0,
+      "completions/mean_length": 512.8660888671875,
+      "completions/mean_terminated_length": 512.8660888671875,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 2.165076089760124,
+      "grad_norm": 0.7323153614997864,
+      "kl": 0.1158447265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0304,
+      "num_tokens": 245740580.0,
+      "reward": 1.3687500953674316,
+      "reward_std": 0.22040800750255585,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3687500059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.40808966755867004,
+      "step": 2098
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1176.0,
+      "completions/max_terminated_length": 1176.0,
+      "completions/mean_length": 474.8839416503906,
+      "completions/mean_terminated_length": 474.8839416503906,
+      "completions/min_length": 154.0,
+      "completions/min_terminated_length": 154.0,
+      "epoch": 2.1661078153211246,
+      "grad_norm": 0.73459392786026,
+      "kl": 0.1187744140625,
+      "learning_rate": 1e-06,
+      "loss": 0.019,
+      "num_tokens": 245861079.0,
+      "reward": 1.4508930444717407,
+      "reward_std": 0.17947648465633392,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4508928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.4086333215236664,
+      "step": 2099
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1600.0,
+      "completions/max_terminated_length": 1600.0,
+      "completions/mean_length": 486.65179443359375,
+      "completions/mean_terminated_length": 486.65179443359375,
+      "completions/min_length": 123.0,
+      "completions/min_terminated_length": 123.0,
+      "epoch": 2.1671395408821255,
+      "grad_norm": 0.7303931713104248,
+      "kl": 0.1201171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0039,
+      "num_tokens": 245970877.0,
+      "reward": 1.427232265472412,
+      "reward_std": 0.23644983768463135,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42723211646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.40630659461021423,
+      "step": 2100
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 978.0,
+      "completions/max_terminated_length": 978.0,
+      "completions/mean_length": 442.6607360839844,
+      "completions/mean_terminated_length": 442.6607360839844,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 2.1681712664431263,
+      "grad_norm": 0.7605082988739014,
+      "kl": 0.1337890625,
+      "learning_rate": 1e-06,
+      "loss": 0.004,
+      "num_tokens": 246084603.0,
+      "reward": 1.3745537996292114,
+      "reward_std": 0.16059215366840363,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37455353140830994,
+      "rewards/curriculum_aware_reward_fn/std": 0.37102216482162476,
+      "step": 2101
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3405.0,
+      "completions/max_terminated_length": 3405.0,
+      "completions/mean_length": 535.0714721679688,
+      "completions/mean_terminated_length": 535.0714721679688,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 2.1692029920041267,
+      "grad_norm": 0.6256927251815796,
+      "kl": 0.111328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0116,
+      "num_tokens": 246209169.0,
+      "reward": 1.4223215579986572,
+      "reward_std": 0.18943563103675842,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42232146859169006,
+      "rewards/curriculum_aware_reward_fn/std": 0.4398934841156006,
+      "step": 2102
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 928.0,
+      "completions/max_terminated_length": 928.0,
+      "completions/mean_length": 449.169677734375,
+      "completions/mean_terminated_length": 449.169677734375,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 2.1702347175651275,
+      "grad_norm": 0.7536016702651978,
+      "kl": 0.1292724609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0114,
+      "num_tokens": 246327936.0,
+      "reward": 1.5486608743667603,
+      "reward_std": 0.2521655261516571,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5486606955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.4142974019050598,
+      "step": 2103
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1337.0,
+      "completions/max_terminated_length": 1337.0,
+      "completions/mean_length": 524.0892944335938,
+      "completions/mean_terminated_length": 524.0892944335938,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 2.1712664431261284,
+      "grad_norm": 0.76837557554245,
+      "kl": 0.1265869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0153,
+      "num_tokens": 246455710.0,
+      "reward": 1.3549107313156128,
+      "reward_std": 0.24753983318805695,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3549107015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.40341779589653015,
+      "step": 2104
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1051.0,
+      "completions/max_terminated_length": 1051.0,
+      "completions/mean_length": 430.2232360839844,
+      "completions/mean_terminated_length": 430.2232360839844,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 2.172298168687129,
+      "grad_norm": 0.8480135202407837,
+      "kl": 0.1304931640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0147,
+      "num_tokens": 246576317.0,
+      "reward": 1.5651787519454956,
+      "reward_std": 0.22769606113433838,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5651785731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.40829458832740784,
+      "step": 2105
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 984.0,
+      "completions/max_terminated_length": 984.0,
+      "completions/mean_length": 496.15179443359375,
+      "completions/mean_terminated_length": 496.15179443359375,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 2.17332989424813,
+      "grad_norm": 0.65647953748703,
+      "kl": 0.126220703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0094,
+      "num_tokens": 246704833.0,
+      "reward": 1.4482142925262451,
+      "reward_std": 0.19437025487422943,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4571428596973419,
+      "rewards/curriculum_aware_reward_fn/std": 0.41600075364112854,
+      "step": 2106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2709.0,
+      "completions/max_terminated_length": 2709.0,
+      "completions/mean_length": 457.607177734375,
+      "completions/mean_terminated_length": 457.607177734375,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 2.174361619809131,
+      "grad_norm": 0.7182737588882446,
+      "kl": 0.1285400390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0334,
+      "num_tokens": 246821801.0,
+      "reward": 1.4437501430511475,
+      "reward_std": 0.21361099183559418,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4437499940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.42786526679992676,
+      "step": 2107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 990.0,
+      "completions/max_terminated_length": 990.0,
+      "completions/mean_length": 517.0803833007812,
+      "completions/mean_terminated_length": 517.0803833007812,
+      "completions/min_length": 148.0,
+      "completions/min_terminated_length": 148.0,
+      "epoch": 2.1753933453701317,
+      "grad_norm": 0.7584192752838135,
+      "kl": 0.11572265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0328,
+      "num_tokens": 246951289.0,
+      "reward": 1.4312500953674316,
+      "reward_std": 0.2715103030204773,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4491071403026581,
+      "rewards/curriculum_aware_reward_fn/std": 0.40879857540130615,
+      "step": 2108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1060.0,
+      "completions/max_terminated_length": 1060.0,
+      "completions/mean_length": 518.375,
+      "completions/mean_terminated_length": 518.375,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "epoch": 2.176425070931132,
+      "grad_norm": 0.6757792830467224,
+      "kl": 0.107421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0085,
+      "num_tokens": 247076533.0,
+      "reward": 1.4361608028411865,
+      "reward_std": 0.17304961383342743,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43616074323654175,
+      "rewards/curriculum_aware_reward_fn/std": 0.43429604172706604,
+      "step": 2109
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 891.0,
+      "completions/max_terminated_length": 891.0,
+      "completions/mean_length": 433.7321472167969,
+      "completions/mean_terminated_length": 433.7321472167969,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 2.177456796492133,
+      "grad_norm": 0.6956561803817749,
+      "kl": 0.1221923828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0024,
+      "num_tokens": 247189917.0,
+      "reward": 1.614732265472412,
+      "reward_std": 0.12834465503692627,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6147321462631226,
+      "rewards/curriculum_aware_reward_fn/std": 0.42270249128341675,
+      "step": 2110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 940.0,
+      "completions/max_terminated_length": 940.0,
+      "completions/mean_length": 495.4910888671875,
+      "completions/mean_terminated_length": 495.4910888671875,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 2.178488522053134,
+      "grad_norm": 0.8153955340385437,
+      "kl": 0.1334228515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0234,
+      "num_tokens": 247312822.0,
+      "reward": 1.3299108743667603,
+      "reward_std": 0.18136192858219147,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33883926272392273,
+      "rewards/curriculum_aware_reward_fn/std": 0.386932373046875,
+      "step": 2111
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1063.0,
+      "completions/max_terminated_length": 1063.0,
+      "completions/mean_length": 502.8839416503906,
+      "completions/mean_terminated_length": 502.8839416503906,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 2.1795202476141347,
+      "grad_norm": 0.7842468023300171,
+      "kl": 0.112548828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0163,
+      "num_tokens": 247432453.0,
+      "reward": 1.3258930444717407,
+      "reward_std": 0.27183881402015686,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3258928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3902518153190613,
+      "step": 2112
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1342.0,
+      "completions/mean_length": 565.5625,
+      "completions/mean_terminated_length": 501.3727111816406,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 2.1805519731751355,
+      "grad_norm": 0.6904152631759644,
+      "kl": 0.1268310546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0058,
+      "num_tokens": 247576023.0,
+      "reward": 1.5044645071029663,
+      "reward_std": 0.1780475676059723,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5044642686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.5449420809745789,
+      "step": 2113
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1255.0,
+      "completions/max_terminated_length": 1255.0,
+      "completions/mean_length": 507.0357360839844,
+      "completions/mean_terminated_length": 507.0357360839844,
+      "completions/min_length": 263.0,
+      "completions/min_terminated_length": 263.0,
+      "epoch": 2.1815836987361363,
+      "grad_norm": 0.7472612261772156,
+      "kl": 0.1280517578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0121,
+      "num_tokens": 247695181.0,
+      "reward": 1.5468751192092896,
+      "reward_std": 0.16403816640377045,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.546875,
+      "rewards/curriculum_aware_reward_fn/std": 0.4169430434703827,
+      "step": 2114
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1153.0,
+      "completions/max_terminated_length": 1153.0,
+      "completions/mean_length": 506.9375305175781,
+      "completions/mean_terminated_length": 506.9375305175781,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 2.182615424297137,
+      "grad_norm": 0.7527168393135071,
+      "kl": 0.1275634765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0178,
+      "num_tokens": 247822854.0,
+      "reward": 1.4482142925262451,
+      "reward_std": 0.13642320036888123,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4482142925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.4124705195426941,
+      "step": 2115
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1068.0,
+      "completions/max_terminated_length": 1068.0,
+      "completions/mean_length": 501.4910888671875,
+      "completions/mean_terminated_length": 501.4910888671875,
+      "completions/min_length": 263.0,
+      "completions/min_terminated_length": 263.0,
+      "epoch": 2.1836471498581376,
+      "grad_norm": 0.6995075345039368,
+      "kl": 0.1165771484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0018,
+      "num_tokens": 247948531.0,
+      "reward": 1.3142857551574707,
+      "reward_std": 0.1445000171661377,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3142857253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.415164589881897,
+      "step": 2116
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1058.0,
+      "completions/max_terminated_length": 1058.0,
+      "completions/mean_length": 448.2589416503906,
+      "completions/mean_terminated_length": 448.2589416503906,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 2.1846788754191384,
+      "grad_norm": 0.62738037109375,
+      "kl": 0.12548828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0106,
+      "num_tokens": 248052279.0,
+      "reward": 1.5205358266830444,
+      "reward_std": 0.10798604786396027,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5205356478691101,
+      "rewards/curriculum_aware_reward_fn/std": 0.43072080612182617,
+      "step": 2117
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2031.0,
+      "completions/max_terminated_length": 2031.0,
+      "completions/mean_length": 520.3482666015625,
+      "completions/mean_terminated_length": 520.3482666015625,
+      "completions/min_length": 245.0,
+      "completions/min_terminated_length": 245.0,
+      "epoch": 2.1857106009801393,
+      "grad_norm": 0.773405134677887,
+      "kl": 0.13330078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0089,
+      "num_tokens": 248179977.0,
+      "reward": 1.5383931398391724,
+      "reward_std": 0.22789046168327332,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5473214387893677,
+      "rewards/curriculum_aware_reward_fn/std": 0.4031318426132202,
+      "step": 2118
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1355.0,
+      "completions/max_terminated_length": 1355.0,
+      "completions/mean_length": 497.71429443359375,
+      "completions/mean_terminated_length": 497.71429443359375,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 2.18674232654114,
+      "grad_norm": 0.7330118417739868,
+      "kl": 0.1221923828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0223,
+      "num_tokens": 248307100.0,
+      "reward": 1.3544644117355347,
+      "reward_std": 0.10890157520771027,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3544642925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.4141937494277954,
+      "step": 2119
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1394.0,
+      "completions/max_terminated_length": 1394.0,
+      "completions/mean_length": 497.3214416503906,
+      "completions/mean_terminated_length": 497.3214416503906,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 2.187774052102141,
+      "grad_norm": 0.6636401414871216,
+      "kl": 0.126953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0144,
+      "num_tokens": 248424834.0,
+      "reward": 1.5687501430511475,
+      "reward_std": 0.2093847543001175,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5687499642372131,
+      "rewards/curriculum_aware_reward_fn/std": 0.44509968161582947,
+      "step": 2120
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1626.0,
+      "completions/max_terminated_length": 1626.0,
+      "completions/mean_length": 479.5714416503906,
+      "completions/mean_terminated_length": 479.5714416503906,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 2.1888057776631418,
+      "grad_norm": 0.7832713723182678,
+      "kl": 0.1363525390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0214,
+      "num_tokens": 248551885.0,
+      "reward": 1.5040180683135986,
+      "reward_std": 0.21539589762687683,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5218750238418579,
+      "rewards/curriculum_aware_reward_fn/std": 0.4540002942085266,
+      "step": 2121
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1432.0,
+      "completions/mean_length": 505.40179443359375,
+      "completions/mean_terminated_length": 473.0540771484375,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 2.189837503224142,
+      "grad_norm": 0.7848573327064514,
+      "kl": 0.12451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0235,
+      "num_tokens": 248674360.0,
+      "reward": 1.579464316368103,
+      "reward_std": 0.24824805557727814,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5794642567634583,
+      "rewards/curriculum_aware_reward_fn/std": 0.4694041609764099,
+      "step": 2122
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1162.0,
+      "completions/max_terminated_length": 1162.0,
+      "completions/mean_length": 471.5000305175781,
+      "completions/mean_terminated_length": 471.5000305175781,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 2.190869228785143,
+      "grad_norm": 0.8156066536903381,
+      "kl": 0.1309814453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0317,
+      "num_tokens": 248793339.0,
+      "reward": 1.4714287519454956,
+      "reward_std": 0.2152327597141266,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4714285731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.4535999298095703,
+      "step": 2123
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1339.0,
+      "completions/max_terminated_length": 1339.0,
+      "completions/mean_length": 524.6607666015625,
+      "completions/mean_terminated_length": 524.6607666015625,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "epoch": 2.191900954346144,
+      "grad_norm": 0.7268944382667542,
+      "kl": 0.1322021484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0159,
+      "num_tokens": 248913505.0,
+      "reward": 1.3598215579986572,
+      "reward_std": 0.18459539115428925,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35982146859169006,
+      "rewards/curriculum_aware_reward_fn/std": 0.3496999740600586,
+      "step": 2124
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1046.0,
+      "completions/max_terminated_length": 1046.0,
+      "completions/mean_length": 494.2589416503906,
+      "completions/mean_terminated_length": 494.2589416503906,
+      "completions/min_length": 264.0,
+      "completions/min_terminated_length": 264.0,
+      "epoch": 2.1929326799071447,
+      "grad_norm": 0.7613234519958496,
+      "kl": 0.12744140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0322,
+      "num_tokens": 249031162.0,
+      "reward": 1.587053656578064,
+      "reward_std": 0.18938103318214417,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5870535969734192,
+      "rewards/curriculum_aware_reward_fn/std": 0.4418809711933136,
+      "step": 2125
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 953.0,
+      "completions/max_terminated_length": 953.0,
+      "completions/mean_length": 468.794677734375,
+      "completions/mean_terminated_length": 468.794677734375,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 2.1939644054681455,
+      "grad_norm": 0.5832720994949341,
+      "kl": 0.1263427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.004,
+      "num_tokens": 249153464.0,
+      "reward": 1.4330357313156128,
+      "reward_std": 0.16529995203018188,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4330357015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.4615657329559326,
+      "step": 2126
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1110.0,
+      "completions/max_terminated_length": 1110.0,
+      "completions/mean_length": 459.1875305175781,
+      "completions/mean_terminated_length": 459.1875305175781,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 2.1949961310291464,
+      "grad_norm": 0.8431884050369263,
+      "kl": 0.1591796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0056,
+      "num_tokens": 249269776.0,
+      "reward": 1.4276785850524902,
+      "reward_std": 0.23011423647403717,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43660715222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.3886904716491699,
+      "step": 2127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1473.0,
+      "completions/max_terminated_length": 1473.0,
+      "completions/mean_length": 477.544677734375,
+      "completions/mean_terminated_length": 477.544677734375,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 2.196027856590147,
+      "grad_norm": 0.7806416153907776,
+      "kl": 0.131591796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0208,
+      "num_tokens": 249375516.0,
+      "reward": 1.5459822416305542,
+      "reward_std": 0.24724382162094116,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5459821820259094,
+      "rewards/curriculum_aware_reward_fn/std": 0.4110598564147949,
+      "step": 2128
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1181.0,
+      "completions/max_terminated_length": 1181.0,
+      "completions/mean_length": 492.0714416503906,
+      "completions/mean_terminated_length": 492.0714416503906,
+      "completions/min_length": 245.0,
+      "completions/min_terminated_length": 245.0,
+      "epoch": 2.1970595821511476,
+      "grad_norm": 0.6576789617538452,
+      "kl": 0.1273193359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0031,
+      "num_tokens": 249502882.0,
+      "reward": 1.3633930683135986,
+      "reward_std": 0.20141588151454926,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3633928596973419,
+      "rewards/curriculum_aware_reward_fn/std": 0.39215174317359924,
+      "step": 2129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2546.0,
+      "completions/max_terminated_length": 2546.0,
+      "completions/mean_length": 438.45538330078125,
+      "completions/mean_terminated_length": 438.45538330078125,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 2.1980913077121484,
+      "grad_norm": 0.7464306950569153,
+      "kl": 0.135009765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0025,
+      "num_tokens": 249616298.0,
+      "reward": 1.6316964626312256,
+      "reward_std": 0.1531204730272293,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6316964030265808,
+      "rewards/curriculum_aware_reward_fn/std": 0.3808475434780121,
+      "step": 2130
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1904.0,
+      "completions/max_terminated_length": 1904.0,
+      "completions/mean_length": 499.982177734375,
+      "completions/mean_terminated_length": 499.982177734375,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 2.1991230332731493,
+      "grad_norm": 0.6995850801467896,
+      "kl": 0.12451171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0219,
+      "num_tokens": 249740315.0,
+      "reward": 1.4102680683135986,
+      "reward_std": 0.11930114775896072,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41919639706611633,
+      "rewards/curriculum_aware_reward_fn/std": 0.3942860960960388,
+      "step": 2131
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1609.0,
+      "completions/max_terminated_length": 1609.0,
+      "completions/mean_length": 517.2232666015625,
+      "completions/mean_terminated_length": 517.2232666015625,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 2.20015475883415,
+      "grad_norm": 0.8047731518745422,
+      "kl": 0.1268310546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0307,
+      "num_tokens": 249867908.0,
+      "reward": 1.4915179014205933,
+      "reward_std": 0.198326975107193,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4915178418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.4356773793697357,
+      "step": 2132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1301.0,
+      "completions/max_terminated_length": 1301.0,
+      "completions/mean_length": 463.669677734375,
+      "completions/mean_terminated_length": 463.669677734375,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 2.201186484395151,
+      "grad_norm": 0.6989313364028931,
+      "kl": 0.1282958984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0222,
+      "num_tokens": 249983266.0,
+      "reward": 1.5669643878936768,
+      "reward_std": 0.19032026827335358,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5669642686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.42846643924713135,
+      "step": 2133
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1163.0,
+      "completions/max_terminated_length": 1163.0,
+      "completions/mean_length": 484.51788330078125,
+      "completions/mean_terminated_length": 484.51788330078125,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 2.202218209956152,
+      "grad_norm": 0.843909740447998,
+      "kl": 0.128662109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0428,
+      "num_tokens": 250114959.0,
+      "reward": 1.503571629524231,
+      "reward_std": 0.24994143843650818,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5035714507102966,
+      "rewards/curriculum_aware_reward_fn/std": 0.39322608709335327,
+      "step": 2134
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 967.0,
+      "completions/max_terminated_length": 967.0,
+      "completions/mean_length": 428.2946472167969,
+      "completions/mean_terminated_length": 428.2946472167969,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 2.203249935517152,
+      "grad_norm": 0.7581227421760559,
+      "kl": 0.1234130859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0007,
+      "num_tokens": 250227965.0,
+      "reward": 1.506250023841858,
+      "reward_std": 0.16399899125099182,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5062500238418579,
+      "rewards/curriculum_aware_reward_fn/std": 0.42288821935653687,
+      "step": 2135
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2759.0,
+      "completions/max_terminated_length": 2759.0,
+      "completions/mean_length": 402.4285888671875,
+      "completions/mean_terminated_length": 402.4285888671875,
+      "completions/min_length": 152.0,
+      "completions/min_terminated_length": 152.0,
+      "epoch": 2.204281661078153,
+      "grad_norm": 0.8055557608604431,
+      "kl": 0.1396484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0158,
+      "num_tokens": 250329611.0,
+      "reward": 1.6191965341567993,
+      "reward_std": 0.16283194720745087,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6191964149475098,
+      "rewards/curriculum_aware_reward_fn/std": 0.4231076240539551,
+      "step": 2136
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1142.0,
+      "completions/max_terminated_length": 1142.0,
+      "completions/mean_length": 421.26788330078125,
+      "completions/mean_terminated_length": 421.26788330078125,
+      "completions/min_length": 154.0,
+      "completions/min_terminated_length": 154.0,
+      "epoch": 2.205313386639154,
+      "grad_norm": 0.7206082344055176,
+      "kl": 0.1309814453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0022,
+      "num_tokens": 250444441.0,
+      "reward": 1.4982143640518188,
+      "reward_std": 0.19586418569087982,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4982143044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.4536105692386627,
+      "step": 2137
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1214.0,
+      "completions/max_terminated_length": 1214.0,
+      "completions/mean_length": 429.1339416503906,
+      "completions/mean_terminated_length": 429.1339416503906,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 2.2063451122001547,
+      "grad_norm": 0.782223105430603,
+      "kl": 0.133056640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0241,
+      "num_tokens": 250559550.0,
+      "reward": 1.3928571939468384,
+      "reward_std": 0.1982770413160324,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3928571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.42244765162467957,
+      "step": 2138
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1527.0,
+      "completions/max_terminated_length": 1527.0,
+      "completions/mean_length": 505.8482360839844,
+      "completions/mean_terminated_length": 505.8482360839844,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 2.2073768377611556,
+      "grad_norm": 0.6624898314476013,
+      "kl": 0.117919921875,
+      "learning_rate": 1e-06,
+      "loss": 0.005,
+      "num_tokens": 250687731.0,
+      "reward": 1.3897321224212646,
+      "reward_std": 0.1644437313079834,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38973215222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.35349851846694946,
+      "step": 2139
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1354.0,
+      "completions/max_terminated_length": 1354.0,
+      "completions/mean_length": 433.89288330078125,
+      "completions/mean_terminated_length": 433.89288330078125,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 2.2084085633221564,
+      "grad_norm": 0.7602798342704773,
+      "kl": 0.14306640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0429,
+      "num_tokens": 250807678.0,
+      "reward": 1.5544644594192505,
+      "reward_std": 0.20387950539588928,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5633928775787354,
+      "rewards/curriculum_aware_reward_fn/std": 0.3997173607349396,
+      "step": 2140
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1214.0,
+      "completions/mean_length": 518.857177734375,
+      "completions/mean_terminated_length": 486.6306457519531,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 2.2094402888831572,
+      "grad_norm": 0.7733542323112488,
+      "kl": 0.1219482421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0383,
+      "num_tokens": 250937253.0,
+      "reward": 1.4982143640518188,
+      "reward_std": 0.21743346750736237,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4982143044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.3919762969017029,
+      "step": 2141
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1021.0,
+      "completions/max_terminated_length": 1021.0,
+      "completions/mean_length": 446.15179443359375,
+      "completions/mean_terminated_length": 446.15179443359375,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 2.210472014444158,
+      "grad_norm": 0.6671974658966064,
+      "kl": 0.11962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0186,
+      "num_tokens": 251060862.0,
+      "reward": 1.5214287042617798,
+      "reward_std": 0.14280013740062714,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5214285850524902,
+      "rewards/curriculum_aware_reward_fn/std": 0.44472455978393555,
+      "step": 2142
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 766.0,
+      "completions/max_terminated_length": 766.0,
+      "completions/mean_length": 456.5535888671875,
+      "completions/mean_terminated_length": 456.5535888671875,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "epoch": 2.2115037400051585,
+      "grad_norm": 0.775130033493042,
+      "kl": 0.1229248046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0014,
+      "num_tokens": 251177817.0,
+      "reward": 1.2549108266830444,
+      "reward_std": 0.15199097990989685,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25491073727607727,
+      "rewards/curriculum_aware_reward_fn/std": 0.31192225217819214,
+      "step": 2143
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 902.0,
+      "completions/max_terminated_length": 902.0,
+      "completions/mean_length": 431.52679443359375,
+      "completions/mean_terminated_length": 431.52679443359375,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 2.2125354655661593,
+      "grad_norm": 0.7285803556442261,
+      "kl": 0.12939453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0013,
+      "num_tokens": 251297632.0,
+      "reward": 1.3763394355773926,
+      "reward_std": 0.15684416890144348,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37633928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.4005041718482971,
+      "step": 2144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1134.0,
+      "completions/max_terminated_length": 1134.0,
+      "completions/mean_length": 439.3482360839844,
+      "completions/mean_terminated_length": 439.3482360839844,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "epoch": 2.21356719112716,
+      "grad_norm": 0.7767670750617981,
+      "kl": 0.111328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0239,
+      "num_tokens": 251408180.0,
+      "reward": 1.6044644117355347,
+      "reward_std": 0.17807038128376007,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6044642329216003,
+      "rewards/curriculum_aware_reward_fn/std": 0.4751269817352295,
+      "step": 2145
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1101.0,
+      "completions/max_terminated_length": 1101.0,
+      "completions/mean_length": 449.3214416503906,
+      "completions/mean_terminated_length": 449.3214416503906,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 2.214598916688161,
+      "grad_norm": 0.5723203420639038,
+      "kl": 0.107421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0028,
+      "num_tokens": 251520461.0,
+      "reward": 1.5924108028411865,
+      "reward_std": 0.1818428784608841,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.592410683631897,
+      "rewards/curriculum_aware_reward_fn/std": 0.45292890071868896,
+      "step": 2146
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 784.0,
+      "completions/max_terminated_length": 784.0,
+      "completions/mean_length": 425.2857360839844,
+      "completions/mean_terminated_length": 425.2857360839844,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 2.215630642249162,
+      "grad_norm": 0.7234082221984863,
+      "kl": 0.111083984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0064,
+      "num_tokens": 251623372.0,
+      "reward": 1.5941965579986572,
+      "reward_std": 0.2118162214756012,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5941964387893677,
+      "rewards/curriculum_aware_reward_fn/std": 0.4257872998714447,
+      "step": 2147
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1098.0,
+      "completions/max_terminated_length": 1098.0,
+      "completions/mean_length": 459.5357360839844,
+      "completions/mean_terminated_length": 459.5357360839844,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 2.2166623678101627,
+      "grad_norm": 0.8283867835998535,
+      "kl": 0.119384765625,
+      "learning_rate": 1e-06,
+      "loss": 0.001,
+      "num_tokens": 251744980.0,
+      "reward": 1.4450894594192505,
+      "reward_std": 0.19101282954216003,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44508928060531616,
+      "rewards/curriculum_aware_reward_fn/std": 0.40614426136016846,
+      "step": 2148
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 863.0,
+      "completions/max_terminated_length": 863.0,
+      "completions/mean_length": 435.70538330078125,
+      "completions/mean_terminated_length": 435.70538330078125,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 2.217694093371163,
+      "grad_norm": 0.7950035929679871,
+      "kl": 0.13330078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0029,
+      "num_tokens": 251868350.0,
+      "reward": 1.3888393640518188,
+      "reward_std": 0.1650308221578598,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3977678716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.4264027178287506,
+      "step": 2149
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1430.0,
+      "completions/max_terminated_length": 1430.0,
+      "completions/mean_length": 428.0357360839844,
+      "completions/mean_terminated_length": 428.0357360839844,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "epoch": 2.218725818932164,
+      "grad_norm": 0.8781709671020508,
+      "kl": 0.1246337890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0176,
+      "num_tokens": 251989559.0,
+      "reward": 1.3375002145767212,
+      "reward_std": 0.1628822684288025,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33750003576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.36052390933036804,
+      "step": 2150
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1137.0,
+      "completions/max_terminated_length": 1137.0,
+      "completions/mean_length": 433.7321472167969,
+      "completions/mean_terminated_length": 433.7321472167969,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 2.2197575444931648,
+      "grad_norm": 0.6704356074333191,
+      "kl": 0.1156005859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0339,
+      "num_tokens": 252100024.0,
+      "reward": 1.4651787281036377,
+      "reward_std": 0.16542240977287292,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46517857909202576,
+      "rewards/curriculum_aware_reward_fn/std": 0.46128684282302856,
+      "step": 2151
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1097.0,
+      "completions/max_terminated_length": 1097.0,
+      "completions/mean_length": 434.4107360839844,
+      "completions/mean_terminated_length": 434.4107360839844,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 2.2207892700541656,
+      "grad_norm": 0.7110837697982788,
+      "kl": 0.124267578125,
+      "learning_rate": 1e-06,
+      "loss": 0.01,
+      "num_tokens": 252218531.0,
+      "reward": 1.5674108266830444,
+      "reward_std": 0.20352497696876526,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5763392448425293,
+      "rewards/curriculum_aware_reward_fn/std": 0.5065765380859375,
+      "step": 2152
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1754.0,
+      "completions/max_terminated_length": 1754.0,
+      "completions/mean_length": 461.52679443359375,
+      "completions/mean_terminated_length": 461.52679443359375,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 2.2218209956151664,
+      "grad_norm": 0.7591835856437683,
+      "kl": 0.1285400390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0204,
+      "num_tokens": 252339312.0,
+      "reward": 1.4964287281036377,
+      "reward_std": 0.20607492327690125,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49642854928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.42620861530303955,
+      "step": 2153
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 861.0,
+      "completions/max_terminated_length": 861.0,
+      "completions/mean_length": 406.9375305175781,
+      "completions/mean_terminated_length": 406.9375305175781,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "epoch": 2.2228527211761673,
+      "grad_norm": 0.8121379017829895,
+      "kl": 0.1046142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0271,
+      "num_tokens": 252447497.0,
+      "reward": 1.6383929252624512,
+      "reward_std": 0.19473080337047577,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6383928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.38291096687316895,
+      "step": 2154
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 999.0,
+      "completions/max_terminated_length": 999.0,
+      "completions/mean_length": 403.9375305175781,
+      "completions/mean_terminated_length": 403.9375305175781,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "epoch": 2.223884446737168,
+      "grad_norm": 0.7377510666847229,
+      "kl": 0.11181640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0388,
+      "num_tokens": 252547779.0,
+      "reward": 1.697767972946167,
+      "reward_std": 0.22175738215446472,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6977678537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.4093715250492096,
+      "step": 2155
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 757.0,
+      "completions/max_terminated_length": 757.0,
+      "completions/mean_length": 405.4821472167969,
+      "completions/mean_terminated_length": 405.4821472167969,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 2.2249161722981685,
+      "grad_norm": 0.7825002074241638,
+      "kl": 0.116455078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0099,
+      "num_tokens": 252653400.0,
+      "reward": 1.5647321939468384,
+      "reward_std": 0.14512257277965546,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5647321343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.44263941049575806,
+      "step": 2156
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 983.0,
+      "completions/max_terminated_length": 983.0,
+      "completions/mean_length": 481.6875305175781,
+      "completions/mean_terminated_length": 481.6875305175781,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 2.2259478978591694,
+      "grad_norm": 0.7722179293632507,
+      "kl": 0.1181640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0048,
+      "num_tokens": 252777111.0,
+      "reward": 1.4276787042617798,
+      "reward_std": 0.19991116225719452,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42767855525016785,
+      "rewards/curriculum_aware_reward_fn/std": 0.4300030767917633,
+      "step": 2157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 910.0,
+      "completions/max_terminated_length": 910.0,
+      "completions/mean_length": 407.8660888671875,
+      "completions/mean_terminated_length": 407.8660888671875,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "epoch": 2.22697962342017,
+      "grad_norm": 0.6580060720443726,
+      "kl": 0.1263427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0075,
+      "num_tokens": 252883650.0,
+      "reward": 1.6477681398391724,
+      "reward_std": 0.15886934101581573,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6477679014205933,
+      "rewards/curriculum_aware_reward_fn/std": 0.4077177047729492,
+      "step": 2158
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1382.0,
+      "completions/max_terminated_length": 1382.0,
+      "completions/mean_length": 431.46429443359375,
+      "completions/mean_terminated_length": 431.46429443359375,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 2.228011348981171,
+      "grad_norm": 0.7175585627555847,
+      "kl": 0.1387939453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0079,
+      "num_tokens": 253004984.0,
+      "reward": 1.4406250715255737,
+      "reward_std": 0.21226482093334198,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44955354928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.4401318430900574,
+      "step": 2159
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 831.0,
+      "completions/max_terminated_length": 831.0,
+      "completions/mean_length": 439.4107360839844,
+      "completions/mean_terminated_length": 439.4107360839844,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 2.229043074542172,
+      "grad_norm": 0.6940580010414124,
+      "kl": 0.114501953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0076,
+      "num_tokens": 253120782.0,
+      "reward": 1.5446430444717407,
+      "reward_std": 0.16438443958759308,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5446428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.44084057211875916,
+      "step": 2160
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 795.0,
+      "completions/max_terminated_length": 795.0,
+      "completions/mean_length": 420.9375305175781,
+      "completions/mean_terminated_length": 420.9375305175781,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 2.2300748001031727,
+      "grad_norm": 0.9348301291465759,
+      "kl": 0.1226806640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0145,
+      "num_tokens": 253236324.0,
+      "reward": 1.5491071939468384,
+      "reward_std": 0.2583702802658081,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5491071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.4151403605937958,
+      "step": 2161
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1091.0,
+      "completions/max_terminated_length": 1091.0,
+      "completions/mean_length": 459.357177734375,
+      "completions/mean_terminated_length": 459.357177734375,
+      "completions/min_length": 269.0,
+      "completions/min_terminated_length": 269.0,
+      "epoch": 2.231106525664173,
+      "grad_norm": 0.7177847623825073,
+      "kl": 0.11181640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0065,
+      "num_tokens": 253359088.0,
+      "reward": 1.4714287519454956,
+      "reward_std": 0.16225051879882812,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4714285731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.4265255630016327,
+      "step": 2162
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1367.0,
+      "completions/max_terminated_length": 1367.0,
+      "completions/mean_length": 503.107177734375,
+      "completions/mean_terminated_length": 503.107177734375,
+      "completions/min_length": 245.0,
+      "completions/min_terminated_length": 245.0,
+      "epoch": 2.232138251225174,
+      "grad_norm": 0.6663182973861694,
+      "kl": 0.1199951171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0106,
+      "num_tokens": 253485198.0,
+      "reward": 1.4745537042617798,
+      "reward_std": 0.14503777027130127,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47455355525016785,
+      "rewards/curriculum_aware_reward_fn/std": 0.4369218349456787,
+      "step": 2163
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1002.0,
+      "completions/max_terminated_length": 1002.0,
+      "completions/mean_length": 443.3214416503906,
+      "completions/mean_terminated_length": 443.3214416503906,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 2.233169976786175,
+      "grad_norm": 0.6235113739967346,
+      "kl": 0.1243896484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0142,
+      "num_tokens": 253607300.0,
+      "reward": 1.5187500715255737,
+      "reward_std": 0.14643462002277374,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.518750011920929,
+      "rewards/curriculum_aware_reward_fn/std": 0.43903684616088867,
+      "step": 2164
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1213.0,
+      "completions/max_terminated_length": 1213.0,
+      "completions/mean_length": 445.9107360839844,
+      "completions/mean_terminated_length": 445.9107360839844,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 2.2342017023471756,
+      "grad_norm": 0.8566228151321411,
+      "kl": 0.124267578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0479,
+      "num_tokens": 253725534.0,
+      "reward": 1.519196629524231,
+      "reward_std": 0.2857770025730133,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5191964507102966,
+      "rewards/curriculum_aware_reward_fn/std": 0.4281347095966339,
+      "step": 2165
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1362.0,
+      "completions/max_terminated_length": 1362.0,
+      "completions/mean_length": 487.0625305175781,
+      "completions/mean_terminated_length": 487.0625305175781,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 2.2352334279081765,
+      "grad_norm": 0.623132050037384,
+      "kl": 0.1077880859375,
+      "learning_rate": 1e-06,
+      "loss": 0.004,
+      "num_tokens": 253851527.0,
+      "reward": 1.3781250715255737,
+      "reward_std": 0.15566569566726685,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38705354928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.4219082295894623,
+      "step": 2166
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1244.0,
+      "completions/max_terminated_length": 1244.0,
+      "completions/mean_length": 507.3214416503906,
+      "completions/mean_terminated_length": 507.3214416503906,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "epoch": 2.2362651534691773,
+      "grad_norm": 0.7437611222267151,
+      "kl": 0.121826171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0311,
+      "num_tokens": 253976660.0,
+      "reward": 1.4370537996292114,
+      "reward_std": 0.24500703811645508,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43705353140830994,
+      "rewards/curriculum_aware_reward_fn/std": 0.40977612137794495,
+      "step": 2167
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1385.0,
+      "completions/max_terminated_length": 1385.0,
+      "completions/mean_length": 424.27679443359375,
+      "completions/mean_terminated_length": 424.27679443359375,
+      "completions/min_length": 138.0,
+      "completions/min_terminated_length": 138.0,
+      "epoch": 2.237296879030178,
+      "grad_norm": 0.7895928025245667,
+      "kl": 0.129150390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0113,
+      "num_tokens": 254094991.0,
+      "reward": 1.469642996788025,
+      "reward_std": 0.14035160839557648,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46964284777641296,
+      "rewards/curriculum_aware_reward_fn/std": 0.37622812390327454,
+      "step": 2168
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 920.0,
+      "completions/max_terminated_length": 920.0,
+      "completions/mean_length": 397.4285888671875,
+      "completions/mean_terminated_length": 397.4285888671875,
+      "completions/min_length": 231.0,
+      "completions/min_terminated_length": 231.0,
+      "epoch": 2.2383286045911786,
+      "grad_norm": 0.8234912753105164,
+      "kl": 0.1220703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0088,
+      "num_tokens": 254208824.0,
+      "reward": 1.5513393878936768,
+      "reward_std": 0.23913827538490295,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5513392686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.45665547251701355,
+      "step": 2169
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1095.0,
+      "completions/max_terminated_length": 1095.0,
+      "completions/mean_length": 444.4375305175781,
+      "completions/mean_terminated_length": 444.4375305175781,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 2.2393603301521794,
+      "grad_norm": 0.7604856491088867,
+      "kl": 0.110107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0189,
+      "num_tokens": 254331324.0,
+      "reward": 1.5223214626312256,
+      "reward_std": 0.24196383357048035,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5223214030265808,
+      "rewards/curriculum_aware_reward_fn/std": 0.44816136360168457,
+      "step": 2170
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1074.0,
+      "completions/max_terminated_length": 1074.0,
+      "completions/mean_length": 463.8482360839844,
+      "completions/mean_terminated_length": 463.8482360839844,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 2.2403920557131802,
+      "grad_norm": 0.7959080934524536,
+      "kl": 0.127197265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0204,
+      "num_tokens": 254455319.0,
+      "reward": 1.4455358982086182,
+      "reward_std": 0.1944299191236496,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44553568959236145,
+      "rewards/curriculum_aware_reward_fn/std": 0.4174436330795288,
+      "step": 2171
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1652.0,
+      "completions/max_terminated_length": 1652.0,
+      "completions/mean_length": 456.857177734375,
+      "completions/mean_terminated_length": 456.857177734375,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "epoch": 2.241423781274181,
+      "grad_norm": 0.7582790851593018,
+      "kl": 0.1181640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0109,
+      "num_tokens": 254581583.0,
+      "reward": 1.4687501192092896,
+      "reward_std": 0.2497893124818802,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46875,
+      "rewards/curriculum_aware_reward_fn/std": 0.43184730410575867,
+      "step": 2172
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1748.0,
+      "completions/max_terminated_length": 1748.0,
+      "completions/mean_length": 523.8482666015625,
+      "completions/mean_terminated_length": 523.8482666015625,
+      "completions/min_length": 259.0,
+      "completions/min_terminated_length": 259.0,
+      "epoch": 2.242455506835182,
+      "grad_norm": 0.6578783988952637,
+      "kl": 0.112548828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0091,
+      "num_tokens": 254707776.0,
+      "reward": 1.4723217487335205,
+      "reward_std": 0.20645909011363983,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48125001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.4099617898464203,
+      "step": 2173
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1240.0,
+      "completions/max_terminated_length": 1240.0,
+      "completions/mean_length": 471.40179443359375,
+      "completions/mean_terminated_length": 471.40179443359375,
+      "completions/min_length": 105.0,
+      "completions/min_terminated_length": 105.0,
+      "epoch": 2.2434872323961828,
+      "grad_norm": 0.6668171286582947,
+      "kl": 0.1129150390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0027,
+      "num_tokens": 254822362.0,
+      "reward": 1.4473215341567993,
+      "reward_std": 0.12001083791255951,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44732141494750977,
+      "rewards/curriculum_aware_reward_fn/std": 0.4498167335987091,
+      "step": 2174
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 896.0,
+      "completions/max_terminated_length": 896.0,
+      "completions/mean_length": 433.3482360839844,
+      "completions/mean_terminated_length": 433.3482360839844,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "epoch": 2.2445189579571836,
+      "grad_norm": 0.7935535907745361,
+      "kl": 0.130615234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0037,
+      "num_tokens": 254932119.0,
+      "reward": 1.4691966772079468,
+      "reward_std": 0.15878739953041077,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4691964089870453,
+      "rewards/curriculum_aware_reward_fn/std": 0.4048270583152771,
+      "step": 2175
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 721.0,
+      "completions/max_terminated_length": 721.0,
+      "completions/mean_length": 403.6785888671875,
+      "completions/mean_terminated_length": 403.6785888671875,
+      "completions/min_length": 158.0,
+      "completions/min_terminated_length": 158.0,
+      "epoch": 2.245550683518184,
+      "grad_norm": 0.86411452293396,
+      "kl": 0.1263427734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0071,
+      "num_tokens": 255045637.0,
+      "reward": 1.6334823369979858,
+      "reward_std": 0.22985121607780457,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6334820985794067,
+      "rewards/curriculum_aware_reward_fn/std": 0.42966508865356445,
+      "step": 2176
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 830.0,
+      "completions/max_terminated_length": 830.0,
+      "completions/mean_length": 443.1339416503906,
+      "completions/mean_terminated_length": 443.1339416503906,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "epoch": 2.246582409079185,
+      "grad_norm": 0.7280702590942383,
+      "kl": 0.1129150390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0036,
+      "num_tokens": 255167816.0,
+      "reward": 1.469642996788025,
+      "reward_std": 0.18711194396018982,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46964284777641296,
+      "rewards/curriculum_aware_reward_fn/std": 0.42460137605667114,
+      "step": 2177
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1205.0,
+      "completions/max_terminated_length": 1205.0,
+      "completions/mean_length": 422.14288330078125,
+      "completions/mean_terminated_length": 422.14288330078125,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 2.2476141346401857,
+      "grad_norm": 0.7460468411445618,
+      "kl": 0.1173095703125,
+      "learning_rate": 1e-06,
+      "loss": 0.008,
+      "num_tokens": 255278390.0,
+      "reward": 1.6116071939468384,
+      "reward_std": 0.16963741183280945,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6116071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.42058423161506653,
+      "step": 2178
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1267.0,
+      "completions/max_terminated_length": 1267.0,
+      "completions/mean_length": 414.2410888671875,
+      "completions/mean_terminated_length": 414.2410888671875,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 2.2486458602011865,
+      "grad_norm": 0.7804027795791626,
+      "kl": 0.1292724609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0025,
+      "num_tokens": 255389374.0,
+      "reward": 1.5683037042617798,
+      "reward_std": 0.23866964876651764,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5772321820259094,
+      "rewards/curriculum_aware_reward_fn/std": 0.42584776878356934,
+      "step": 2179
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 881.0,
+      "completions/max_terminated_length": 881.0,
+      "completions/mean_length": 413.4107360839844,
+      "completions/mean_terminated_length": 413.4107360839844,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 2.2496775857621873,
+      "grad_norm": 0.8622057437896729,
+      "kl": 0.124267578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0098,
+      "num_tokens": 255506852.0,
+      "reward": 1.557142972946167,
+      "reward_std": 0.22154921293258667,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5571428537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.4266915023326874,
+      "step": 2180
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 952.0,
+      "completions/max_terminated_length": 952.0,
+      "completions/mean_length": 435.1964416503906,
+      "completions/mean_terminated_length": 435.1964416503906,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 2.250709311323188,
+      "grad_norm": 0.7885429859161377,
+      "kl": 0.1177978515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0133,
+      "num_tokens": 255615038.0,
+      "reward": 1.4687501192092896,
+      "reward_std": 0.24748605489730835,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4866071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.40752947330474854,
+      "step": 2181
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 800.0,
+      "completions/max_terminated_length": 800.0,
+      "completions/mean_length": 408.9196472167969,
+      "completions/mean_terminated_length": 408.9196472167969,
+      "completions/min_length": 113.0,
+      "completions/min_terminated_length": 113.0,
+      "epoch": 2.251741036884189,
+      "grad_norm": 0.655225932598114,
+      "kl": 0.1143798828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0036,
+      "num_tokens": 255736395.0,
+      "reward": 1.4709821939468384,
+      "reward_std": 0.16663986444473267,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4799107611179352,
+      "rewards/curriculum_aware_reward_fn/std": 0.43493086099624634,
+      "step": 2182
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1210.0,
+      "completions/max_terminated_length": 1210.0,
+      "completions/mean_length": 468.01788330078125,
+      "completions/mean_terminated_length": 468.01788330078125,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 2.2527727624451894,
+      "grad_norm": 0.6557589769363403,
+      "kl": 0.12060546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0193,
+      "num_tokens": 255866037.0,
+      "reward": 1.3339285850524902,
+      "reward_std": 0.17304901778697968,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34285715222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.4116467535495758,
+      "step": 2183
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 891.0,
+      "completions/max_terminated_length": 891.0,
+      "completions/mean_length": 448.8214416503906,
+      "completions/mean_terminated_length": 448.8214416503906,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 2.2538044880061903,
+      "grad_norm": 0.7125776410102844,
+      "kl": 0.1163330078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0138,
+      "num_tokens": 255976505.0,
+      "reward": 1.4607144594192505,
+      "reward_std": 0.23142556846141815,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46964284777641296,
+      "rewards/curriculum_aware_reward_fn/std": 0.43704357743263245,
+      "step": 2184
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1157.0,
+      "completions/max_terminated_length": 1157.0,
+      "completions/mean_length": 487.7232360839844,
+      "completions/mean_terminated_length": 487.7232360839844,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 2.254836213567191,
+      "grad_norm": 0.7252135872840881,
+      "kl": 0.1541748046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0051,
+      "num_tokens": 256106863.0,
+      "reward": 1.5406252145767212,
+      "reward_std": 0.25469622015953064,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5406250357627869,
+      "rewards/curriculum_aware_reward_fn/std": 0.4111361503601074,
+      "step": 2185
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 981.0,
+      "completions/max_terminated_length": 981.0,
+      "completions/mean_length": 450.482177734375,
+      "completions/mean_terminated_length": 450.482177734375,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 2.255867939128192,
+      "grad_norm": 0.6247652173042297,
+      "kl": 0.1253662109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0035,
+      "num_tokens": 256232098.0,
+      "reward": 1.5013394355773926,
+      "reward_std": 0.15615127980709076,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.501339316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.4447116255760193,
+      "step": 2186
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1319.0,
+      "completions/max_terminated_length": 1319.0,
+      "completions/mean_length": 478.8482360839844,
+      "completions/mean_terminated_length": 478.8482360839844,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 2.256899664689193,
+      "grad_norm": 0.7280486226081848,
+      "kl": 0.1190185546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0309,
+      "num_tokens": 256359453.0,
+      "reward": 1.4285714626312256,
+      "reward_std": 0.183299720287323,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4285714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.37088367342948914,
+      "step": 2187
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 809.0,
+      "completions/max_terminated_length": 809.0,
+      "completions/mean_length": 434.6160888671875,
+      "completions/mean_terminated_length": 434.6160888671875,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 2.2579313902501936,
+      "grad_norm": 0.6480599045753479,
+      "kl": 0.1168212890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0122,
+      "num_tokens": 256469289.0,
+      "reward": 1.4513393640518188,
+      "reward_std": 0.16868485510349274,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4513393044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.45074811577796936,
+      "step": 2188
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1130.0,
+      "completions/max_terminated_length": 1130.0,
+      "completions/mean_length": 455.71429443359375,
+      "completions/mean_terminated_length": 455.71429443359375,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 2.258963115811194,
+      "grad_norm": 0.8033361434936523,
+      "kl": 0.12060546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0046,
+      "num_tokens": 256583854.0,
+      "reward": 1.6004464626312256,
+      "reward_std": 0.21607355773448944,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6004464030265808,
+      "rewards/curriculum_aware_reward_fn/std": 0.38152700662612915,
+      "step": 2189
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1203.0,
+      "completions/max_terminated_length": 1203.0,
+      "completions/mean_length": 471.0535888671875,
+      "completions/mean_terminated_length": 471.0535888671875,
+      "completions/min_length": 276.0,
+      "completions/min_terminated_length": 276.0,
+      "epoch": 2.259994841372195,
+      "grad_norm": 0.7653167843818665,
+      "kl": 0.122314453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0036,
+      "num_tokens": 256713283.0,
+      "reward": 1.4058037996292114,
+      "reward_std": 0.23669905960559845,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40580353140830994,
+      "rewards/curriculum_aware_reward_fn/std": 0.410709410905838,
+      "step": 2190
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 784.0,
+      "completions/max_terminated_length": 784.0,
+      "completions/mean_length": 424.14288330078125,
+      "completions/mean_terminated_length": 424.14288330078125,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 2.2610265669331957,
+      "grad_norm": 0.8268042206764221,
+      "kl": 0.131103515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0579,
+      "num_tokens": 256818628.0,
+      "reward": 1.549553632736206,
+      "reward_std": 0.23739364743232727,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5495535731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.430665522813797,
+      "step": 2191
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1373.0,
+      "completions/max_terminated_length": 1373.0,
+      "completions/mean_length": 431.9821472167969,
+      "completions/mean_terminated_length": 431.9821472167969,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "epoch": 2.2620582924941965,
+      "grad_norm": 0.6653684973716736,
+      "kl": 0.1309814453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0022,
+      "num_tokens": 256931529.0,
+      "reward": 1.5334821939468384,
+      "reward_std": 0.173820361495018,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5424107313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.4615001678466797,
+      "step": 2192
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1228.0,
+      "completions/max_terminated_length": 1228.0,
+      "completions/mean_length": 418.3125305175781,
+      "completions/mean_terminated_length": 418.3125305175781,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 2.2630900180551974,
+      "grad_norm": 0.7461981773376465,
+      "kl": 0.132080078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0204,
+      "num_tokens": 257047302.0,
+      "reward": 1.6566966772079468,
+      "reward_std": 0.20143945515155792,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6566964387893677,
+      "rewards/curriculum_aware_reward_fn/std": 0.5511043667793274,
+      "step": 2193
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 871.0,
+      "completions/max_terminated_length": 871.0,
+      "completions/mean_length": 416.5535888671875,
+      "completions/mean_terminated_length": 416.5535888671875,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 2.2641217436161982,
+      "grad_norm": 0.7070055603981018,
+      "kl": 0.12548828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0113,
+      "num_tokens": 257169885.0,
+      "reward": 1.5910714864730835,
+      "reward_std": 0.12923413515090942,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5999999642372131,
+      "rewards/curriculum_aware_reward_fn/std": 0.44010645151138306,
+      "step": 2194
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1140.0,
+      "completions/max_terminated_length": 1140.0,
+      "completions/mean_length": 498.5535888671875,
+      "completions/mean_terminated_length": 498.5535888671875,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 2.265153469177199,
+      "grad_norm": 0.7625406384468079,
+      "kl": 0.1231689453125,
+      "learning_rate": 1e-06,
+      "loss": -0.024,
+      "num_tokens": 257297620.0,
+      "reward": 1.3973215818405151,
+      "reward_std": 0.25870275497436523,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3973214328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.39070501923561096,
+      "step": 2195
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 994.0,
+      "completions/max_terminated_length": 994.0,
+      "completions/mean_length": 468.6964416503906,
+      "completions/mean_terminated_length": 468.6964416503906,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 2.2661851947381995,
+      "grad_norm": 0.8173531293869019,
+      "kl": 0.1103515625,
+      "learning_rate": 1e-06,
+      "loss": 0.003,
+      "num_tokens": 257423803.0,
+      "reward": 1.415178656578064,
+      "reward_std": 0.2854968011379242,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4241071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.40929415822029114,
+      "step": 2196
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 806.0,
+      "completions/max_terminated_length": 806.0,
+      "completions/mean_length": 424.5000305175781,
+      "completions/mean_terminated_length": 424.5000305175781,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 2.2672169202992003,
+      "grad_norm": 0.8341377973556519,
+      "kl": 0.124755859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0111,
+      "num_tokens": 257530882.0,
+      "reward": 1.5250000953674316,
+      "reward_std": 0.14234140515327454,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5250000357627869,
+      "rewards/curriculum_aware_reward_fn/std": 0.38414525985717773,
+      "step": 2197
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1062.0,
+      "completions/max_terminated_length": 1062.0,
+      "completions/mean_length": 479.26788330078125,
+      "completions/mean_terminated_length": 479.26788330078125,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 2.268248645860201,
+      "grad_norm": 0.7552391290664673,
+      "kl": 0.125244140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0356,
+      "num_tokens": 257657598.0,
+      "reward": 1.3946430683135986,
+      "reward_std": 0.18035611510276794,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4035714268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.3959658443927765,
+      "step": 2198
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 996.0,
+      "completions/max_terminated_length": 996.0,
+      "completions/mean_length": 402.26788330078125,
+      "completions/mean_terminated_length": 402.26788330078125,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 2.269280371421202,
+      "grad_norm": 0.8114144802093506,
+      "kl": 0.1298828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0141,
+      "num_tokens": 257757189.0,
+      "reward": 1.4589285850524902,
+      "reward_std": 0.22217056155204773,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45892858505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.40560346841812134,
+      "step": 2199
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1441.0,
+      "completions/max_terminated_length": 1441.0,
+      "completions/mean_length": 511.482177734375,
+      "completions/mean_terminated_length": 511.482177734375,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 2.270312096982203,
+      "grad_norm": 0.6917448043823242,
+      "kl": 0.1060791015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0047,
+      "num_tokens": 257880723.0,
+      "reward": 1.3986608982086182,
+      "reward_std": 0.2185220867395401,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39866071939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3975975215435028,
+      "step": 2200
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 767.0,
+      "completions/max_terminated_length": 767.0,
+      "completions/mean_length": 369.2500305175781,
+      "completions/mean_terminated_length": 369.2500305175781,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 2.2713438225432037,
+      "grad_norm": 0.6788763999938965,
+      "kl": 0.1298828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0093,
+      "num_tokens": 257983952.0,
+      "reward": 1.6758930683135986,
+      "reward_std": 0.12376383692026138,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6758928894996643,
+      "rewards/curriculum_aware_reward_fn/std": 0.3803141415119171,
+      "step": 2201
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1203.0,
+      "completions/max_terminated_length": 1203.0,
+      "completions/mean_length": 467.2232360839844,
+      "completions/mean_terminated_length": 467.2232360839844,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "epoch": 2.272375548104204,
+      "grad_norm": 0.7781023383140564,
+      "kl": 0.113037109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0088,
+      "num_tokens": 258108059.0,
+      "reward": 1.4794644117355347,
+      "reward_std": 0.19456489384174347,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4794642925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.3801110088825226,
+      "step": 2202
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 856.0,
+      "completions/max_terminated_length": 856.0,
+      "completions/mean_length": 414.8839416503906,
+      "completions/mean_terminated_length": 414.8839416503906,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 2.273407273665205,
+      "grad_norm": 0.7982087135314941,
+      "kl": 0.1376953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0288,
+      "num_tokens": 258222489.0,
+      "reward": 1.4321428537368774,
+      "reward_std": 0.2655171751976013,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43214288353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.4219675660133362,
+      "step": 2203
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1294.0,
+      "completions/max_terminated_length": 1294.0,
+      "completions/mean_length": 486.33038330078125,
+      "completions/mean_terminated_length": 486.33038330078125,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 2.2744389992262057,
+      "grad_norm": 0.8439200520515442,
+      "kl": 0.108642578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0346,
+      "num_tokens": 258344165.0,
+      "reward": 1.5200893878936768,
+      "reward_std": 0.23756206035614014,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5200892686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.3993496894836426,
+      "step": 2204
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1247.0,
+      "completions/max_terminated_length": 1247.0,
+      "completions/mean_length": 424.8660888671875,
+      "completions/mean_terminated_length": 424.8660888671875,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 2.2754707247872066,
+      "grad_norm": 0.6224795579910278,
+      "kl": 0.12744140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0252,
+      "num_tokens": 258453021.0,
+      "reward": 1.5691964626312256,
+      "reward_std": 0.10045679658651352,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5691964030265808,
+      "rewards/curriculum_aware_reward_fn/std": 0.4354890286922455,
+      "step": 2205
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 902.0,
+      "completions/max_terminated_length": 902.0,
+      "completions/mean_length": 474.7232360839844,
+      "completions/mean_terminated_length": 474.7232360839844,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 2.2765024503482074,
+      "grad_norm": 0.8304800987243652,
+      "kl": 0.12255859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0031,
+      "num_tokens": 258578773.0,
+      "reward": 1.497321605682373,
+      "reward_std": 0.19776052236557007,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4973214268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.37565794587135315,
+      "step": 2206
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 890.0,
+      "completions/max_terminated_length": 890.0,
+      "completions/mean_length": 426.9910888671875,
+      "completions/mean_terminated_length": 426.9910888671875,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "epoch": 2.2775341759092083,
+      "grad_norm": 0.7672892808914185,
+      "kl": 0.134765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0061,
+      "num_tokens": 258701433.0,
+      "reward": 1.6191965341567993,
+      "reward_std": 0.23367580771446228,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6191964745521545,
+      "rewards/curriculum_aware_reward_fn/std": 0.439196914434433,
+      "step": 2207
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 845.0,
+      "completions/max_terminated_length": 845.0,
+      "completions/mean_length": 445.1160888671875,
+      "completions/mean_terminated_length": 445.1160888671875,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 2.278565901470209,
+      "grad_norm": 0.7751359343528748,
+      "kl": 0.13525390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0016,
+      "num_tokens": 258827628.0,
+      "reward": 1.5513393878936768,
+      "reward_std": 0.29002559185028076,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5513392686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.4421721398830414,
+      "step": 2208
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1278.0,
+      "completions/max_terminated_length": 1278.0,
+      "completions/mean_length": 464.9375305175781,
+      "completions/mean_terminated_length": 464.9375305175781,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "epoch": 2.27959762703121,
+      "grad_norm": 0.6202963590621948,
+      "kl": 0.1119384765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0008,
+      "num_tokens": 258950604.0,
+      "reward": 1.4495537281036377,
+      "reward_std": 0.18529221415519714,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44955354928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.4137009382247925,
+      "step": 2209
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1027.0,
+      "completions/max_terminated_length": 1027.0,
+      "completions/mean_length": 474.6339416503906,
+      "completions/mean_terminated_length": 474.6339416503906,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 2.2806293525922103,
+      "grad_norm": 0.8014189004898071,
+      "kl": 0.11669921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0025,
+      "num_tokens": 259076143.0,
+      "reward": 1.4544644355773926,
+      "reward_std": 0.1441306471824646,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45446428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.37719646096229553,
+      "step": 2210
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1049.0,
+      "completions/max_terminated_length": 1049.0,
+      "completions/mean_length": 449.26788330078125,
+      "completions/mean_terminated_length": 449.26788330078125,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 2.281661078153211,
+      "grad_norm": 0.794110119342804,
+      "kl": 0.129150390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0088,
+      "num_tokens": 259191659.0,
+      "reward": 1.5883928537368774,
+      "reward_std": 0.16244350373744965,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5883927941322327,
+      "rewards/curriculum_aware_reward_fn/std": 0.5123687386512756,
+      "step": 2211
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 945.0,
+      "completions/max_terminated_length": 945.0,
+      "completions/mean_length": 448.3660888671875,
+      "completions/mean_terminated_length": 448.3660888671875,
+      "completions/min_length": 267.0,
+      "completions/min_terminated_length": 267.0,
+      "epoch": 2.282692803714212,
+      "grad_norm": 0.7245016694068909,
+      "kl": 0.130615234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0066,
+      "num_tokens": 259313745.0,
+      "reward": 1.4339287281036377,
+      "reward_std": 0.18925009667873383,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43392854928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.44210872054100037,
+      "step": 2212
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 905.0,
+      "completions/max_terminated_length": 905.0,
+      "completions/mean_length": 447.21429443359375,
+      "completions/mean_terminated_length": 447.21429443359375,
+      "completions/min_length": 267.0,
+      "completions/min_terminated_length": 267.0,
+      "epoch": 2.283724529275213,
+      "grad_norm": 0.844030499458313,
+      "kl": 0.136962890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0032,
+      "num_tokens": 259432657.0,
+      "reward": 1.4325894117355347,
+      "reward_std": 0.20370341837406158,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4325892925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.42694756388664246,
+      "step": 2213
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 937.0,
+      "completions/max_terminated_length": 937.0,
+      "completions/mean_length": 427.90179443359375,
+      "completions/mean_terminated_length": 427.90179443359375,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 2.2847562548362137,
+      "grad_norm": 0.7148916125297546,
+      "kl": 0.127197265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0047,
+      "num_tokens": 259546844.0,
+      "reward": 1.5227677822113037,
+      "reward_std": 0.25406861305236816,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5316964387893677,
+      "rewards/curriculum_aware_reward_fn/std": 0.4465582072734833,
+      "step": 2214
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1162.0,
+      "completions/max_terminated_length": 1162.0,
+      "completions/mean_length": 485.4285888671875,
+      "completions/mean_terminated_length": 485.4285888671875,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 2.285787980397214,
+      "grad_norm": 0.6173986196517944,
+      "kl": 0.1173095703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0327,
+      "num_tokens": 259672822.0,
+      "reward": 1.4812500476837158,
+      "reward_std": 0.1721828132867813,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48125001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.4544140100479126,
+      "step": 2215
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1214.0,
+      "completions/max_terminated_length": 1214.0,
+      "completions/mean_length": 458.0625305175781,
+      "completions/mean_terminated_length": 458.0625305175781,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 2.286819705958215,
+      "grad_norm": 0.7467107176780701,
+      "kl": 0.12646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0235,
+      "num_tokens": 259791297.0,
+      "reward": 1.3580358028411865,
+      "reward_std": 0.12079530954360962,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35803571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.3610890507698059,
+      "step": 2216
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1018.0,
+      "completions/max_terminated_length": 1018.0,
+      "completions/mean_length": 455.5535888671875,
+      "completions/mean_terminated_length": 455.5535888671875,
+      "completions/min_length": 154.0,
+      "completions/min_terminated_length": 154.0,
+      "epoch": 2.2878514315192158,
+      "grad_norm": 0.7681455016136169,
+      "kl": 0.1279296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0079,
+      "num_tokens": 259909736.0,
+      "reward": 1.3901787996292114,
+      "reward_std": 0.19109110534191132,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39017853140830994,
+      "rewards/curriculum_aware_reward_fn/std": 0.3828437328338623,
+      "step": 2217
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 990.0,
+      "completions/max_terminated_length": 990.0,
+      "completions/mean_length": 416.0357360839844,
+      "completions/mean_terminated_length": 416.0357360839844,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 2.2888831570802166,
+      "grad_norm": 0.8476653099060059,
+      "kl": 0.13671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0122,
+      "num_tokens": 260027781.0,
+      "reward": 1.6607143878936768,
+      "reward_std": 0.09141557663679123,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6607142686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.3630097210407257,
+      "step": 2218
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 753.0,
+      "completions/max_terminated_length": 753.0,
+      "completions/mean_length": 432.51788330078125,
+      "completions/mean_terminated_length": 432.51788330078125,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 2.2899148826412175,
+      "grad_norm": 0.8101479411125183,
+      "kl": 0.1180419921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0078,
+      "num_tokens": 260153475.0,
+      "reward": 1.5424107313156128,
+      "reward_std": 0.20149752497673035,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5424107313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.41078969836235046,
+      "step": 2219
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 759.0,
+      "completions/max_terminated_length": 759.0,
+      "completions/mean_length": 405.65179443359375,
+      "completions/mean_terminated_length": 405.65179443359375,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 2.2909466082022183,
+      "grad_norm": 0.8121070861816406,
+      "kl": 0.1209716796875,
+      "learning_rate": 1e-06,
+      "loss": 0.001,
+      "num_tokens": 260271540.0,
+      "reward": 1.6486608982086182,
+      "reward_std": 0.18063603341579437,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6486607193946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.42066335678100586,
+      "step": 2220
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1101.0,
+      "completions/max_terminated_length": 1101.0,
+      "completions/mean_length": 442.3482360839844,
+      "completions/mean_terminated_length": 442.3482360839844,
+      "completions/min_length": 113.0,
+      "completions/min_terminated_length": 113.0,
+      "epoch": 2.291978333763219,
+      "grad_norm": 0.677873969078064,
+      "kl": 0.1268310546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0207,
+      "num_tokens": 260389123.0,
+      "reward": 1.462053656578064,
+      "reward_std": 0.2162722647190094,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4620535671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.4583187997341156,
+      "step": 2221
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1160.0,
+      "completions/max_terminated_length": 1160.0,
+      "completions/mean_length": 481.1964416503906,
+      "completions/mean_terminated_length": 481.1964416503906,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 2.29301005932422,
+      "grad_norm": 0.6380321979522705,
+      "kl": 0.116455078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0193,
+      "num_tokens": 260510959.0,
+      "reward": 1.6214287281036377,
+      "reward_std": 0.1788286566734314,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6214286088943481,
+      "rewards/curriculum_aware_reward_fn/std": 0.4121856689453125,
+      "step": 2222
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 725.0,
+      "completions/max_terminated_length": 725.0,
+      "completions/mean_length": 390.4732360839844,
+      "completions/mean_terminated_length": 390.4732360839844,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "epoch": 2.2940417848852204,
+      "grad_norm": 0.8966089487075806,
+      "kl": 0.13671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0197,
+      "num_tokens": 260621458.0,
+      "reward": 1.5357143878936768,
+      "reward_std": 0.21034854650497437,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5357142686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.4012608826160431,
+      "step": 2223
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 794.0,
+      "completions/max_terminated_length": 794.0,
+      "completions/mean_length": 472.7410888671875,
+      "completions/mean_terminated_length": 472.7410888671875,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 2.295073510446221,
+      "grad_norm": 1.1112465858459473,
+      "kl": 0.19580078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0032,
+      "num_tokens": 260750502.0,
+      "reward": 1.4330357313156128,
+      "reward_std": 0.1422305405139923,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4330357015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.40606623888015747,
+      "step": 2224
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1136.0,
+      "completions/max_terminated_length": 1136.0,
+      "completions/mean_length": 452.6785888671875,
+      "completions/mean_terminated_length": 452.6785888671875,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 2.296105236007222,
+      "grad_norm": 0.8665755987167358,
+      "kl": 0.1282958984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0106,
+      "num_tokens": 260862172.0,
+      "reward": 1.536607265472412,
+      "reward_std": 0.1756691336631775,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5366071462631226,
+      "rewards/curriculum_aware_reward_fn/std": 0.36772873997688293,
+      "step": 2225
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1264.0,
+      "completions/max_terminated_length": 1264.0,
+      "completions/mean_length": 445.3750305175781,
+      "completions/mean_terminated_length": 445.3750305175781,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 2.297136961568223,
+      "grad_norm": 0.8168461322784424,
+      "kl": 0.119140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0015,
+      "num_tokens": 260981970.0,
+      "reward": 1.604017972946167,
+      "reward_std": 0.18865753710269928,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6040178537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.42844459414482117,
+      "step": 2226
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1155.0,
+      "completions/max_terminated_length": 1155.0,
+      "completions/mean_length": 475.0625305175781,
+      "completions/mean_terminated_length": 475.0625305175781,
+      "completions/min_length": 147.0,
+      "completions/min_terminated_length": 147.0,
+      "epoch": 2.2981686871292237,
+      "grad_norm": 0.7673314213752747,
+      "kl": 0.1199951171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0064,
+      "num_tokens": 261108054.0,
+      "reward": 1.4441964626312256,
+      "reward_std": 0.21315507590770721,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4441964328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.4495115280151367,
+      "step": 2227
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1077.0,
+      "completions/max_terminated_length": 1077.0,
+      "completions/mean_length": 455.3750305175781,
+      "completions/mean_terminated_length": 455.3750305175781,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 2.2992004126902246,
+      "grad_norm": 0.6256400942802429,
+      "kl": 0.140380859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0269,
+      "num_tokens": 261222063.0,
+      "reward": 1.375892996788025,
+      "reward_std": 0.18649274110794067,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37589284777641296,
+      "rewards/curriculum_aware_reward_fn/std": 0.45102396607398987,
+      "step": 2228
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 894.0,
+      "completions/max_terminated_length": 894.0,
+      "completions/mean_length": 446.2232360839844,
+      "completions/mean_terminated_length": 446.2232360839844,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 2.300232138251225,
+      "grad_norm": 0.7473752498626709,
+      "kl": 0.1207275390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0046,
+      "num_tokens": 261335362.0,
+      "reward": 1.4924107789993286,
+      "reward_std": 0.18438197672367096,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49241071939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.4038781523704529,
+      "step": 2229
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1218.0,
+      "completions/max_terminated_length": 1218.0,
+      "completions/mean_length": 487.7410888671875,
+      "completions/mean_terminated_length": 487.7410888671875,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 2.301263863812226,
+      "grad_norm": 0.8416672348976135,
+      "kl": 0.1160888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.018,
+      "num_tokens": 261459968.0,
+      "reward": 1.4116073846817017,
+      "reward_std": 0.2052520364522934,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41160711646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.43090006709098816,
+      "step": 2230
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1127.0,
+      "completions/mean_length": 439.232177734375,
+      "completions/mean_terminated_length": 406.2882995605469,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 2.3022955893732266,
+      "grad_norm": 0.7082986831665039,
+      "kl": 0.1705322265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0575,
+      "num_tokens": 261574220.0,
+      "reward": 1.5107144117355347,
+      "reward_std": 0.24220815300941467,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5285714268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.40568673610687256,
+      "step": 2231
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 783.0,
+      "completions/max_terminated_length": 783.0,
+      "completions/mean_length": 453.0982360839844,
+      "completions/mean_terminated_length": 453.0982360839844,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 2.3033273149342275,
+      "grad_norm": 0.7227510809898376,
+      "kl": 0.128662109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0082,
+      "num_tokens": 261695828.0,
+      "reward": 1.6334823369979858,
+      "reward_std": 0.18725861608982086,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6334820985794067,
+      "rewards/curriculum_aware_reward_fn/std": 0.3946368098258972,
+      "step": 2232
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2562.0,
+      "completions/max_terminated_length": 2562.0,
+      "completions/mean_length": 444.6964416503906,
+      "completions/mean_terminated_length": 444.6964416503906,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 2.3043590404952283,
+      "grad_norm": 0.8308013081550598,
+      "kl": 0.137451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0517,
+      "num_tokens": 261813408.0,
+      "reward": 1.4589287042617798,
+      "reward_std": 0.26964929699897766,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47678571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.44402584433555603,
+      "step": 2233
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 846.0,
+      "completions/max_terminated_length": 846.0,
+      "completions/mean_length": 425.9375305175781,
+      "completions/mean_terminated_length": 425.9375305175781,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 2.305390766056229,
+      "grad_norm": 0.772957444190979,
+      "kl": 0.131103515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0012,
+      "num_tokens": 261927137.0,
+      "reward": 1.4250000715255737,
+      "reward_std": 0.1946423500776291,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43392854928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.3968464732170105,
+      "step": 2234
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 977.0,
+      "completions/mean_length": 473.7589416503906,
+      "completions/mean_terminated_length": 441.1261291503906,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 2.30642249161723,
+      "grad_norm": 0.7113568782806396,
+      "kl": 0.11328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0474,
+      "num_tokens": 262055840.0,
+      "reward": 1.4651787281036377,
+      "reward_std": 0.2366374433040619,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47410717606544495,
+      "rewards/curriculum_aware_reward_fn/std": 0.43959349393844604,
+      "step": 2235
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1298.0,
+      "completions/max_terminated_length": 1298.0,
+      "completions/mean_length": 500.0535888671875,
+      "completions/mean_terminated_length": 500.0535888671875,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 2.3074542171782304,
+      "grad_norm": 0.7893040776252747,
+      "kl": 0.10888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0275,
+      "num_tokens": 262179844.0,
+      "reward": 1.3928571939468384,
+      "reward_std": 0.22476813197135925,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3928571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.4019578993320465,
+      "step": 2236
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1026.0,
+      "completions/max_terminated_length": 1026.0,
+      "completions/mean_length": 500.2589416503906,
+      "completions/mean_terminated_length": 500.2589416503906,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 2.3084859427392312,
+      "grad_norm": 0.7808545827865601,
+      "kl": 0.12548828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0198,
+      "num_tokens": 262315743.0,
+      "reward": 1.3504464626312256,
+      "reward_std": 0.2415580451488495,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.359375,
+      "rewards/curriculum_aware_reward_fn/std": 0.39218324422836304,
+      "step": 2237
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1035.0,
+      "completions/max_terminated_length": 1035.0,
+      "completions/mean_length": 445.33929443359375,
+      "completions/mean_terminated_length": 445.33929443359375,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 2.309517668300232,
+      "grad_norm": 0.8054758906364441,
+      "kl": 0.1317138671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0297,
+      "num_tokens": 262437539.0,
+      "reward": 1.5839285850524902,
+      "reward_std": 0.25896844267845154,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5839285850524902,
+      "rewards/curriculum_aware_reward_fn/std": 0.4052145481109619,
+      "step": 2238
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1190.0,
+      "completions/max_terminated_length": 1190.0,
+      "completions/mean_length": 464.96429443359375,
+      "completions/mean_terminated_length": 464.96429443359375,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 2.310549393861233,
+      "grad_norm": 0.5957467555999756,
+      "kl": 0.1160888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0114,
+      "num_tokens": 262550046.0,
+      "reward": 1.5642858743667603,
+      "reward_std": 0.14827460050582886,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5642856955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.3665291666984558,
+      "step": 2239
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1060.0,
+      "completions/max_terminated_length": 1060.0,
+      "completions/mean_length": 480.6875305175781,
+      "completions/mean_terminated_length": 480.6875305175781,
+      "completions/min_length": 273.0,
+      "completions/min_terminated_length": 273.0,
+      "epoch": 2.3115811194222338,
+      "grad_norm": 0.7734221816062927,
+      "kl": 0.122802734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0038,
+      "num_tokens": 262675164.0,
+      "reward": 1.3772321939468384,
+      "reward_std": 0.23709361255168915,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3772321343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.44153133034706116,
+      "step": 2240
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1401.0,
+      "completions/max_terminated_length": 1401.0,
+      "completions/mean_length": 484.4464416503906,
+      "completions/mean_terminated_length": 484.4464416503906,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 2.3126128449832346,
+      "grad_norm": 0.611352264881134,
+      "kl": 0.1220703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0001,
+      "num_tokens": 262796594.0,
+      "reward": 1.5602680444717407,
+      "reward_std": 0.2347055971622467,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5691964030265808,
+      "rewards/curriculum_aware_reward_fn/std": 0.4562007784843445,
+      "step": 2241
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1010.0,
+      "completions/max_terminated_length": 1010.0,
+      "completions/mean_length": 409.9732360839844,
+      "completions/mean_terminated_length": 409.9732360839844,
+      "completions/min_length": 155.0,
+      "completions/min_terminated_length": 155.0,
+      "epoch": 2.313644570544235,
+      "grad_norm": 0.7636814117431641,
+      "kl": 0.134521484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0353,
+      "num_tokens": 262907338.0,
+      "reward": 1.5718750953674316,
+      "reward_std": 0.25764763355255127,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5718750357627869,
+      "rewards/curriculum_aware_reward_fn/std": 0.46230822801589966,
+      "step": 2242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1714.0,
+      "completions/max_terminated_length": 1714.0,
+      "completions/mean_length": 509.14288330078125,
+      "completions/mean_terminated_length": 509.14288330078125,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 2.314676296105236,
+      "grad_norm": 0.8017022013664246,
+      "kl": 0.1229248046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0261,
+      "num_tokens": 263037484.0,
+      "reward": 1.4218751192092896,
+      "reward_std": 0.193229541182518,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.421875,
+      "rewards/curriculum_aware_reward_fn/std": 0.37119126319885254,
+      "step": 2243
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1883.0,
+      "completions/max_terminated_length": 1883.0,
+      "completions/mean_length": 455.5535888671875,
+      "completions/mean_terminated_length": 455.5535888671875,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 2.3157080216662367,
+      "grad_norm": 0.780221164226532,
+      "kl": 0.1318359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0138,
+      "num_tokens": 263157379.0,
+      "reward": 1.555803656578064,
+      "reward_std": 0.23983052372932434,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5558035373687744,
+      "rewards/curriculum_aware_reward_fn/std": 0.43088769912719727,
+      "step": 2244
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1017.0,
+      "completions/max_terminated_length": 1017.0,
+      "completions/mean_length": 393.4196472167969,
+      "completions/mean_terminated_length": 393.4196472167969,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "epoch": 2.3167397472272375,
+      "grad_norm": 0.7895362973213196,
+      "kl": 0.141845703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0141,
+      "num_tokens": 263264925.0,
+      "reward": 1.599107265472412,
+      "reward_std": 0.1769608110189438,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6169642806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.4076163172721863,
+      "step": 2245
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 869.0,
+      "completions/max_terminated_length": 869.0,
+      "completions/mean_length": 434.7410888671875,
+      "completions/mean_terminated_length": 434.7410888671875,
+      "completions/min_length": 231.0,
+      "completions/min_terminated_length": 231.0,
+      "epoch": 2.3177714727882384,
+      "grad_norm": 0.677725613117218,
+      "kl": 0.13525390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0109,
+      "num_tokens": 263372675.0,
+      "reward": 1.3598215579986572,
+      "reward_std": 0.14816580712795258,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3598214089870453,
+      "rewards/curriculum_aware_reward_fn/std": 0.3798908591270447,
+      "step": 2246
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 860.0,
+      "completions/max_terminated_length": 860.0,
+      "completions/mean_length": 457.8839416503906,
+      "completions/mean_terminated_length": 457.8839416503906,
+      "completions/min_length": 120.0,
+      "completions/min_terminated_length": 120.0,
+      "epoch": 2.318803198349239,
+      "grad_norm": 0.7738561034202576,
+      "kl": 0.1317138671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0075,
+      "num_tokens": 263488267.0,
+      "reward": 1.5080358982086182,
+      "reward_std": 0.2324211597442627,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5080357193946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.42487698793411255,
+      "step": 2247
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 858.0,
+      "completions/max_terminated_length": 858.0,
+      "completions/mean_length": 414.21429443359375,
+      "completions/mean_terminated_length": 414.21429443359375,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 2.31983492391024,
+      "grad_norm": 0.7853266596794128,
+      "kl": 0.1280517578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0082,
+      "num_tokens": 263593515.0,
+      "reward": 1.641517996788025,
+      "reward_std": 0.1926991194486618,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6415179371833801,
+      "rewards/curriculum_aware_reward_fn/std": 0.42521265149116516,
+      "step": 2248
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3256.0,
+      "completions/max_terminated_length": 3256.0,
+      "completions/mean_length": 452.5535888671875,
+      "completions/mean_terminated_length": 452.5535888671875,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 2.320866649471241,
+      "grad_norm": 0.6526082158088684,
+      "kl": 0.13037109375,
+      "learning_rate": 1e-06,
+      "loss": 0.019,
+      "num_tokens": 263714845.0,
+      "reward": 1.4147322177886963,
+      "reward_std": 0.13459168374538422,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4147321581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.4585837125778198,
+      "step": 2249
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2341.0,
+      "completions/max_terminated_length": 2341.0,
+      "completions/mean_length": 583.8214721679688,
+      "completions/mean_terminated_length": 583.8214721679688,
+      "completions/min_length": 257.0,
+      "completions/min_terminated_length": 257.0,
+      "epoch": 2.3218983750322413,
+      "grad_norm": 0.6811022162437439,
+      "kl": 0.1121826171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0076,
+      "num_tokens": 263856484.0,
+      "reward": 1.2991071939468384,
+      "reward_std": 0.17378173768520355,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3080357015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.3994677662849426,
+      "step": 2250
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1736.0,
+      "completions/max_terminated_length": 1736.0,
+      "completions/mean_length": 508.9910888671875,
+      "completions/mean_terminated_length": 508.9910888671875,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 2.322930100593242,
+      "grad_norm": 0.8488061428070068,
+      "kl": 0.1119384765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0068,
+      "num_tokens": 263986858.0,
+      "reward": 1.4433037042617798,
+      "reward_std": 0.19707278907299042,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44330358505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.4065065085887909,
+      "step": 2251
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1205.0,
+      "completions/max_terminated_length": 1205.0,
+      "completions/mean_length": 469.1250305175781,
+      "completions/mean_terminated_length": 469.1250305175781,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 2.323961826154243,
+      "grad_norm": 0.7594837546348572,
+      "kl": 0.1251220703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0166,
+      "num_tokens": 264111955.0,
+      "reward": 1.4861608743667603,
+      "reward_std": 0.2393674999475479,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49508926272392273,
+      "rewards/curriculum_aware_reward_fn/std": 0.4587958753108978,
+      "step": 2252
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1379.0,
+      "completions/max_terminated_length": 1379.0,
+      "completions/mean_length": 482.4910888671875,
+      "completions/mean_terminated_length": 482.4910888671875,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 2.324993551715244,
+      "grad_norm": 0.8010709881782532,
+      "kl": 0.1263427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0297,
+      "num_tokens": 264225312.0,
+      "reward": 1.3026787042617798,
+      "reward_std": 0.1675240397453308,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30267858505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.32081252336502075,
+      "step": 2253
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1125.0,
+      "completions/max_terminated_length": 1125.0,
+      "completions/mean_length": 484.5714416503906,
+      "completions/mean_terminated_length": 484.5714416503906,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 2.3260252772762446,
+      "grad_norm": 0.7815201878547668,
+      "kl": 0.111572265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0204,
+      "num_tokens": 264345083.0,
+      "reward": 1.400892972946167,
+      "reward_std": 0.2504502832889557,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41875001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.4531235098838806,
+      "step": 2254
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1900.0,
+      "completions/max_terminated_length": 1900.0,
+      "completions/mean_length": 509.9107360839844,
+      "completions/mean_terminated_length": 509.9107360839844,
+      "completions/min_length": 269.0,
+      "completions/min_terminated_length": 269.0,
+      "epoch": 2.3270570028372455,
+      "grad_norm": 0.7713619470596313,
+      "kl": 0.129150390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0363,
+      "num_tokens": 264469722.0,
+      "reward": 1.427232265472412,
+      "reward_std": 0.1826336532831192,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42723211646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.3809003233909607,
+      "step": 2255
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 991.0,
+      "completions/max_terminated_length": 991.0,
+      "completions/mean_length": 414.51788330078125,
+      "completions/mean_terminated_length": 414.51788330078125,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 2.328088728398246,
+      "grad_norm": 0.6340947151184082,
+      "kl": 0.127685546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0023,
+      "num_tokens": 264579407.0,
+      "reward": 1.6540179252624512,
+      "reward_std": 0.11283912509679794,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6540178656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.42849716544151306,
+      "step": 2256
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1256.0,
+      "completions/max_terminated_length": 1256.0,
+      "completions/mean_length": 479.4464416503906,
+      "completions/mean_terminated_length": 479.4464416503906,
+      "completions/min_length": 147.0,
+      "completions/min_terminated_length": 147.0,
+      "epoch": 2.3291204539592467,
+      "grad_norm": 0.7967779040336609,
+      "kl": 0.1357421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0253,
+      "num_tokens": 264713101.0,
+      "reward": 1.4651787281036377,
+      "reward_std": 0.19649997353553772,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47410711646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.3908861577510834,
+      "step": 2257
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1696.0,
+      "completions/max_terminated_length": 1696.0,
+      "completions/mean_length": 446.857177734375,
+      "completions/mean_terminated_length": 446.857177734375,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 2.3301521795202476,
+      "grad_norm": 0.7497438788414001,
+      "kl": 0.1353759765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0182,
+      "num_tokens": 264834654.0,
+      "reward": 1.6334823369979858,
+      "reward_std": 0.25653204321861267,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6334820985794067,
+      "rewards/curriculum_aware_reward_fn/std": 0.3811663091182709,
+      "step": 2258
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1125.0,
+      "completions/max_terminated_length": 1125.0,
+      "completions/mean_length": 455.8482360839844,
+      "completions/mean_terminated_length": 455.8482360839844,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "epoch": 2.3311839050812484,
+      "grad_norm": 0.8482918739318848,
+      "kl": 0.1436767578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0481,
+      "num_tokens": 264950536.0,
+      "reward": 1.4495537281036377,
+      "reward_std": 0.22892731428146362,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44955354928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.3629518449306488,
+      "step": 2259
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1127.0,
+      "completions/max_terminated_length": 1127.0,
+      "completions/mean_length": 439.6785888671875,
+      "completions/mean_terminated_length": 439.6785888671875,
+      "completions/min_length": 137.0,
+      "completions/min_terminated_length": 137.0,
+      "epoch": 2.3322156306422492,
+      "grad_norm": 0.7075839042663574,
+      "kl": 0.130615234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0055,
+      "num_tokens": 265072720.0,
+      "reward": 1.532589316368103,
+      "reward_std": 0.18878993391990662,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.532589316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.4554331600666046,
+      "step": 2260
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 835.0,
+      "completions/max_terminated_length": 835.0,
+      "completions/mean_length": 441.6785888671875,
+      "completions/mean_terminated_length": 441.6785888671875,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 2.33324735620325,
+      "grad_norm": 0.7362609505653381,
+      "kl": 0.138671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0207,
+      "num_tokens": 265199881.0,
+      "reward": 1.5388394594192505,
+      "reward_std": 0.23343265056610107,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5388392806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.44187918305397034,
+      "step": 2261
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1492.0,
+      "completions/max_terminated_length": 1492.0,
+      "completions/mean_length": 487.6160888671875,
+      "completions/mean_terminated_length": 487.6160888671875,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 2.334279081764251,
+      "grad_norm": 0.6873180270195007,
+      "kl": 0.1212158203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0413,
+      "num_tokens": 265313763.0,
+      "reward": 1.5026787519454956,
+      "reward_std": 0.15679128468036652,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5026785731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.4019568860530853,
+      "step": 2262
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1436.0,
+      "completions/max_terminated_length": 1436.0,
+      "completions/mean_length": 500.4375305175781,
+      "completions/mean_terminated_length": 500.4375305175781,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 2.3353108073252513,
+      "grad_norm": 0.7277102470397949,
+      "kl": 0.1156005859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0008,
+      "num_tokens": 265439469.0,
+      "reward": 1.4388394355773926,
+      "reward_std": 0.2303120493888855,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43883928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3845384418964386,
+      "step": 2263
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1172.0,
+      "completions/max_terminated_length": 1172.0,
+      "completions/mean_length": 453.33929443359375,
+      "completions/mean_terminated_length": 453.33929443359375,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 2.336342532886252,
+      "grad_norm": 0.7865290641784668,
+      "kl": 0.14306640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0001,
+      "num_tokens": 265553516.0,
+      "reward": 1.4803574085235596,
+      "reward_std": 0.18225398659706116,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48035717010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.35938870906829834,
+      "step": 2264
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1266.0,
+      "completions/max_terminated_length": 1266.0,
+      "completions/mean_length": 477.8660888671875,
+      "completions/mean_terminated_length": 477.8660888671875,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 2.337374258447253,
+      "grad_norm": 0.6291416883468628,
+      "kl": 0.12646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0043,
+      "num_tokens": 265678626.0,
+      "reward": 1.3834823369979858,
+      "reward_std": 0.18859294056892395,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3834821283817291,
+      "rewards/curriculum_aware_reward_fn/std": 0.424231618642807,
+      "step": 2265
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1324.0,
+      "completions/max_terminated_length": 1324.0,
+      "completions/mean_length": 446.9732360839844,
+      "completions/mean_terminated_length": 446.9732360839844,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 2.338405984008254,
+      "grad_norm": 0.775569498538971,
+      "kl": 0.1163330078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0067,
+      "num_tokens": 265794193.0,
+      "reward": 1.4651787281036377,
+      "reward_std": 0.15202991664409637,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46517857909202576,
+      "rewards/curriculum_aware_reward_fn/std": 0.43047425150871277,
+      "step": 2266
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1233.0,
+      "completions/max_terminated_length": 1233.0,
+      "completions/mean_length": 443.83038330078125,
+      "completions/mean_terminated_length": 443.83038330078125,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 2.3394377095692547,
+      "grad_norm": 0.8700284957885742,
+      "kl": 0.13916015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0251,
+      "num_tokens": 265907847.0,
+      "reward": 1.4928573369979858,
+      "reward_std": 0.23910239338874817,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5017857551574707,
+      "rewards/curriculum_aware_reward_fn/std": 0.4329829812049866,
+      "step": 2267
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1072.0,
+      "completions/max_terminated_length": 1072.0,
+      "completions/mean_length": 479.169677734375,
+      "completions/mean_terminated_length": 479.169677734375,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 2.3404694351302555,
+      "grad_norm": 0.848242461681366,
+      "kl": 0.138671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0144,
+      "num_tokens": 266036232.0,
+      "reward": 1.4566963911056519,
+      "reward_std": 0.18905363976955414,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45669645071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.4086616039276123,
+      "step": 2268
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2355.0,
+      "completions/max_terminated_length": 2355.0,
+      "completions/mean_length": 502.2589416503906,
+      "completions/mean_terminated_length": 502.2589416503906,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "epoch": 2.341501160691256,
+      "grad_norm": 0.6625986695289612,
+      "kl": 0.119873046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0014,
+      "num_tokens": 266163913.0,
+      "reward": 1.3316965103149414,
+      "reward_std": 0.17715652287006378,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34062501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.38370707631111145,
+      "step": 2269
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 855.0,
+      "completions/max_terminated_length": 855.0,
+      "completions/mean_length": 425.3035888671875,
+      "completions/mean_terminated_length": 425.3035888671875,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 2.3425328862522568,
+      "grad_norm": 0.8239805698394775,
+      "kl": 0.136962890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0156,
+      "num_tokens": 266276549.0,
+      "reward": 1.4437501430511475,
+      "reward_std": 0.17334595322608948,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4437499940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.4235268533229828,
+      "step": 2270
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1084.0,
+      "completions/max_terminated_length": 1084.0,
+      "completions/mean_length": 455.14288330078125,
+      "completions/mean_terminated_length": 455.14288330078125,
+      "completions/min_length": 154.0,
+      "completions/min_terminated_length": 154.0,
+      "epoch": 2.3435646118132576,
+      "grad_norm": 0.7383701801300049,
+      "kl": 0.1275634765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0037,
+      "num_tokens": 266399905.0,
+      "reward": 1.5540181398391724,
+      "reward_std": 0.2154999077320099,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5540178418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.44261398911476135,
+      "step": 2271
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1198.0,
+      "completions/max_terminated_length": 1198.0,
+      "completions/mean_length": 447.4285888671875,
+      "completions/mean_terminated_length": 447.4285888671875,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 2.3445963373742584,
+      "grad_norm": 0.7752024531364441,
+      "kl": 0.134521484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0281,
+      "num_tokens": 266518057.0,
+      "reward": 1.5308037996292114,
+      "reward_std": 0.17774909734725952,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5308035612106323,
+      "rewards/curriculum_aware_reward_fn/std": 0.41731905937194824,
+      "step": 2272
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1265.0,
+      "completions/max_terminated_length": 1265.0,
+      "completions/mean_length": 462.9910888671875,
+      "completions/mean_terminated_length": 462.9910888671875,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "epoch": 2.3456280629352593,
+      "grad_norm": 0.7307128310203552,
+      "kl": 0.1312255859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0224,
+      "num_tokens": 266633808.0,
+      "reward": 1.4352679252624512,
+      "reward_std": 0.16883431375026703,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4352678656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.42312854528427124,
+      "step": 2273
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2141.0,
+      "completions/mean_length": 510.8750305175781,
+      "completions/mean_terminated_length": 478.57659912109375,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 2.34665978849626,
+      "grad_norm": 0.6801409125328064,
+      "kl": 0.13232421875,
+      "learning_rate": 1e-06,
+      "loss": -0.036,
+      "num_tokens": 266755300.0,
+      "reward": 1.5232144594192505,
+      "reward_std": 0.12608736753463745,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5232142806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.4231972098350525,
+      "step": 2274
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2522.0,
+      "completions/max_terminated_length": 2522.0,
+      "completions/mean_length": 495.5535888671875,
+      "completions/mean_terminated_length": 495.5535888671875,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 2.347691514057261,
+      "grad_norm": 0.8027746677398682,
+      "kl": 0.123046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0518,
+      "num_tokens": 266875237.0,
+      "reward": 1.4709821939468384,
+      "reward_std": 0.1888292133808136,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4709821343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.44095709919929504,
+      "step": 2275
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1802.0,
+      "completions/max_terminated_length": 1802.0,
+      "completions/mean_length": 440.08038330078125,
+      "completions/mean_terminated_length": 440.08038330078125,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 2.3487232396182613,
+      "grad_norm": 0.8603218793869019,
+      "kl": 0.140380859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0097,
+      "num_tokens": 266987414.0,
+      "reward": 1.5241072177886963,
+      "reward_std": 0.1688319742679596,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5330356955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.42323046922683716,
+      "step": 2276
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1166.0,
+      "completions/max_terminated_length": 1166.0,
+      "completions/mean_length": 425.5357360839844,
+      "completions/mean_terminated_length": 425.5357360839844,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 2.349754965179262,
+      "grad_norm": 0.7801834344863892,
+      "kl": 0.1314697265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0518,
+      "num_tokens": 267094175.0,
+      "reward": 1.6169644594192505,
+      "reward_std": 0.25597792863845825,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6258928179740906,
+      "rewards/curriculum_aware_reward_fn/std": 0.43199628591537476,
+      "step": 2277
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3730.0,
+      "completions/max_terminated_length": 3730.0,
+      "completions/mean_length": 505.8125305175781,
+      "completions/mean_terminated_length": 505.8125305175781,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 2.350786690740263,
+      "grad_norm": 1.1834087371826172,
+      "kl": 0.130126953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0245,
+      "num_tokens": 267217123.0,
+      "reward": 1.4629465341567993,
+      "reward_std": 0.17395009100437164,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46294641494750977,
+      "rewards/curriculum_aware_reward_fn/std": 0.43947890400886536,
+      "step": 2278
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1123.0,
+      "completions/max_terminated_length": 1123.0,
+      "completions/mean_length": 482.9285888671875,
+      "completions/mean_terminated_length": 482.9285888671875,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 2.351818416301264,
+      "grad_norm": 0.8050830960273743,
+      "kl": 0.13671875,
+      "learning_rate": 1e-06,
+      "loss": 0.046,
+      "num_tokens": 267333679.0,
+      "reward": 1.531250238418579,
+      "reward_std": 0.21188195049762726,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.53125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3829782009124756,
+      "step": 2279
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2027.0,
+      "completions/mean_length": 497.0089416503906,
+      "completions/mean_terminated_length": 464.5856018066406,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 2.3528501418622647,
+      "grad_norm": 0.6472569704055786,
+      "kl": 0.1285400390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0021,
+      "num_tokens": 267445897.0,
+      "reward": 1.46473228931427,
+      "reward_std": 0.18970660865306854,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4736607074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.4229859709739685,
+      "step": 2280
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1989.0,
+      "completions/max_terminated_length": 1989.0,
+      "completions/mean_length": 434.9375305175781,
+      "completions/mean_terminated_length": 434.9375305175781,
+      "completions/min_length": 118.0,
+      "completions/min_terminated_length": 118.0,
+      "epoch": 2.3538818674232656,
+      "grad_norm": 0.8479146957397461,
+      "kl": 0.141357421875,
+      "learning_rate": 1e-06,
+      "loss": 0.008,
+      "num_tokens": 267565982.0,
+      "reward": 1.579017996788025,
+      "reward_std": 0.23399677872657776,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5790179371833801,
+      "rewards/curriculum_aware_reward_fn/std": 0.39178720116615295,
+      "step": 2281
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2161.0,
+      "completions/max_terminated_length": 2161.0,
+      "completions/mean_length": 502.1339416503906,
+      "completions/mean_terminated_length": 502.1339416503906,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 2.354913592984266,
+      "grad_norm": 0.763183057308197,
+      "kl": 0.13623046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0031,
+      "num_tokens": 267687636.0,
+      "reward": 1.3544644117355347,
+      "reward_std": 0.19222049415111542,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3544642925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.38663211464881897,
+      "step": 2282
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3179.0,
+      "completions/mean_length": 527.7589721679688,
+      "completions/mean_terminated_length": 495.61260986328125,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 2.355945318545267,
+      "grad_norm": 0.7211725115776062,
+      "kl": 0.139892578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0191,
+      "num_tokens": 267812230.0,
+      "reward": 1.559821605682373,
+      "reward_std": 0.24818488955497742,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5687500238418579,
+      "rewards/curriculum_aware_reward_fn/std": 0.45182934403419495,
+      "step": 2283
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2126.0,
+      "completions/mean_length": 521.1964721679688,
+      "completions/mean_terminated_length": 456.1999816894531,
+      "completions/min_length": 153.0,
+      "completions/min_terminated_length": 153.0,
+      "epoch": 2.3569770441062676,
+      "grad_norm": 0.6131947636604309,
+      "kl": 0.11376953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0256,
+      "num_tokens": 267944300.0,
+      "reward": 1.596428632736206,
+      "reward_std": 0.1304352879524231,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5964285731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.43701043725013733,
+      "step": 2284
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1044.0,
+      "completions/mean_length": 488.52679443359375,
+      "completions/mean_terminated_length": 456.02703857421875,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 2.3580087696672685,
+      "grad_norm": 0.7710050344467163,
+      "kl": 0.13134765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0432,
+      "num_tokens": 268057549.0,
+      "reward": 1.5428574085235596,
+      "reward_std": 0.20850172638893127,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5517857670783997,
+      "rewards/curriculum_aware_reward_fn/std": 0.38717785477638245,
+      "step": 2285
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2987.0,
+      "completions/mean_length": 477.2410888671875,
+      "completions/mean_terminated_length": 444.6396484375,
+      "completions/min_length": 137.0,
+      "completions/min_terminated_length": 137.0,
+      "epoch": 2.3590404952282693,
+      "grad_norm": 0.66845703125,
+      "kl": 0.1307373046875,
+      "learning_rate": 1e-06,
+      "loss": 0.001,
+      "num_tokens": 268173615.0,
+      "reward": 1.6566965579986572,
+      "reward_std": 0.12179917097091675,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6566964387893677,
+      "rewards/curriculum_aware_reward_fn/std": 0.41554778814315796,
+      "step": 2286
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3478.0,
+      "completions/max_terminated_length": 3478.0,
+      "completions/mean_length": 483.15179443359375,
+      "completions/mean_terminated_length": 483.15179443359375,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 2.36007222078927,
+      "grad_norm": 0.7215074300765991,
+      "kl": 0.124267578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0232,
+      "num_tokens": 268292324.0,
+      "reward": 1.5687501430511475,
+      "reward_std": 0.18647773563861847,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5687499642372131,
+      "rewards/curriculum_aware_reward_fn/std": 0.46938356757164,
+      "step": 2287
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2271.0,
+      "completions/mean_length": 580.8214721679688,
+      "completions/mean_terminated_length": 516.9090576171875,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 2.361103946350271,
+      "grad_norm": 0.7281975150108337,
+      "kl": 0.13330078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0023,
+      "num_tokens": 268422973.0,
+      "reward": 1.333035945892334,
+      "reward_std": 0.22421413660049438,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3419643044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.3631415367126465,
+      "step": 2288
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1779.0,
+      "completions/mean_length": 479.3482360839844,
+      "completions/mean_terminated_length": 446.7657775878906,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 2.362135671911272,
+      "grad_norm": 0.829347550868988,
+      "kl": 0.155517578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0273,
+      "num_tokens": 268533493.0,
+      "reward": 1.5982143878936768,
+      "reward_std": 0.19711095094680786,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6071428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.42462795972824097,
+      "step": 2289
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1011.0,
+      "completions/mean_length": 573.6964721679688,
+      "completions/mean_terminated_length": 476.75225830078125,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 2.3631673974722722,
+      "grad_norm": 0.7868594527244568,
+      "kl": 0.130126953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0456,
+      "num_tokens": 268666263.0,
+      "reward": 1.456696629524231,
+      "reward_std": 0.2679513692855835,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45669645071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.4117363393306732,
+      "step": 2290
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1440.0,
+      "completions/max_terminated_length": 1440.0,
+      "completions/mean_length": 471.52679443359375,
+      "completions/mean_terminated_length": 471.52679443359375,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 2.364199123033273,
+      "grad_norm": 0.8038163781166077,
+      "kl": 0.140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0076,
+      "num_tokens": 268788206.0,
+      "reward": 1.5169644355773926,
+      "reward_std": 0.21957655251026154,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5348213911056519,
+      "rewards/curriculum_aware_reward_fn/std": 0.44331061840057373,
+      "step": 2291
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3965.0,
+      "completions/mean_length": 577.1875,
+      "completions/mean_terminated_length": 480.33941650390625,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 2.365230848594274,
+      "grad_norm": 0.6000884175300598,
+      "kl": 0.145263671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0072,
+      "num_tokens": 268917678.0,
+      "reward": 1.5205358266830444,
+      "reward_std": 0.10521666705608368,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5205357074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.4275718927383423,
+      "step": 2292
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1222.0,
+      "completions/mean_length": 442.0982360839844,
+      "completions/mean_terminated_length": 409.18017578125,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 2.3662625741552747,
+      "grad_norm": 0.7253805994987488,
+      "kl": 0.145751953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0014,
+      "num_tokens": 269031334.0,
+      "reward": 1.6093751192092896,
+      "reward_std": 0.17531929910182953,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.609375,
+      "rewards/curriculum_aware_reward_fn/std": 0.38744550943374634,
+      "step": 2293
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2273.0,
+      "completions/mean_length": 560.7232666015625,
+      "completions/mean_terminated_length": 528.8739013671875,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 2.3672942997162756,
+      "grad_norm": 0.6465750932693481,
+      "kl": 0.127685546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0492,
+      "num_tokens": 269163176.0,
+      "reward": 1.4004465341567993,
+      "reward_std": 0.19433052837848663,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40044641494750977,
+      "rewards/curriculum_aware_reward_fn/std": 0.3976561725139618,
+      "step": 2294
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1605.0,
+      "completions/max_terminated_length": 1605.0,
+      "completions/mean_length": 472.6607360839844,
+      "completions/mean_terminated_length": 472.6607360839844,
+      "completions/min_length": 269.0,
+      "completions/min_terminated_length": 269.0,
+      "epoch": 2.3683260252772764,
+      "grad_norm": 0.7045727372169495,
+      "kl": 0.1318359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0369,
+      "num_tokens": 269284777.0,
+      "reward": 1.4316965341567993,
+      "reward_std": 0.2325831949710846,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44062498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.42162978649139404,
+      "step": 2295
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1876.0,
+      "completions/mean_length": 544.6964721679688,
+      "completions/mean_terminated_length": 446.9541015625,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 2.369357750838277,
+      "grad_norm": 0.7127491235733032,
+      "kl": 0.1331787109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0304,
+      "num_tokens": 269415090.0,
+      "reward": 1.4366071224212646,
+      "reward_std": 0.18148307502269745,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43660715222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.4378480315208435,
+      "step": 2296
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2440.0,
+      "completions/mean_length": 551.4732666015625,
+      "completions/mean_terminated_length": 487.0272521972656,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 2.3703894763992777,
+      "grad_norm": 0.5928733348846436,
+      "kl": 0.138916015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0211,
+      "num_tokens": 269546302.0,
+      "reward": 1.4285714626312256,
+      "reward_std": 0.18738706409931183,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4375,
+      "rewards/curriculum_aware_reward_fn/std": 0.43024715781211853,
+      "step": 2297
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1582.0,
+      "completions/mean_length": 522.6607666015625,
+      "completions/mean_terminated_length": 490.4684753417969,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 2.3714212019602785,
+      "grad_norm": 0.6640140414237976,
+      "kl": 0.140380859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0715,
+      "num_tokens": 269676604.0,
+      "reward": 1.4602679014205933,
+      "reward_std": 0.06861122697591782,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4691964089870453,
+      "rewards/curriculum_aware_reward_fn/std": 0.4369347095489502,
+      "step": 2298
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3210.0,
+      "completions/max_terminated_length": 3210.0,
+      "completions/mean_length": 555.9553833007812,
+      "completions/mean_terminated_length": 555.9553833007812,
+      "completions/min_length": 270.0,
+      "completions/min_terminated_length": 270.0,
+      "epoch": 2.3724529275212793,
+      "grad_norm": 0.6696567535400391,
+      "kl": 0.131103515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0035,
+      "num_tokens": 269805119.0,
+      "reward": 1.474107265472412,
+      "reward_std": 0.24978794157505035,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47410711646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.4368695914745331,
+      "step": 2299
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1357.0,
+      "completions/mean_length": 597.0267944335938,
+      "completions/mean_terminated_length": 500.7247619628906,
+      "completions/min_length": 158.0,
+      "completions/min_terminated_length": 158.0,
+      "epoch": 2.37348465308228,
+      "grad_norm": 0.6863952279090881,
+      "kl": 0.142578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0474,
+      "num_tokens": 269934734.0,
+      "reward": 1.4522322416305542,
+      "reward_std": 0.17944176495075226,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45223215222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.4349789321422577,
+      "step": 2300
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1239.0,
+      "completions/max_terminated_length": 1239.0,
+      "completions/mean_length": 467.232177734375,
+      "completions/mean_terminated_length": 467.232177734375,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "epoch": 2.374516378643281,
+      "grad_norm": 0.7441481351852417,
+      "kl": 0.144775390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0051,
+      "num_tokens": 270053109.0,
+      "reward": 1.5754464864730835,
+      "reward_std": 0.2411445826292038,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5754464268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.41940516233444214,
+      "step": 2301
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3599.0,
+      "completions/mean_length": 611.8660888671875,
+      "completions/mean_terminated_length": 548.5181884765625,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 2.375548104204282,
+      "grad_norm": 0.5696609020233154,
+      "kl": 0.132568359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0106,
+      "num_tokens": 270193864.0,
+      "reward": 1.5214287042617798,
+      "reward_std": 0.14514337480068207,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5214285850524902,
+      "rewards/curriculum_aware_reward_fn/std": 0.4346332550048828,
+      "step": 2302
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3726.0,
+      "completions/mean_length": 564.125,
+      "completions/mean_terminated_length": 499.9090881347656,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 2.3765798297652823,
+      "grad_norm": 0.7344022989273071,
+      "kl": 0.148681640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0311,
+      "num_tokens": 270328600.0,
+      "reward": 1.5705357789993286,
+      "reward_std": 0.25526320934295654,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5705357193946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.5795910954475403,
+      "step": 2303
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1341.0,
+      "completions/mean_length": 550.0625,
+      "completions/mean_terminated_length": 485.59088134765625,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "epoch": 2.377611555326283,
+      "grad_norm": 0.7728509306907654,
+      "kl": 0.1435546875,
+      "learning_rate": 1e-06,
+      "loss": 0.006,
+      "num_tokens": 270447824.0,
+      "reward": 1.5660713911056519,
+      "reward_std": 0.2546859383583069,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5660714507102966,
+      "rewards/curriculum_aware_reward_fn/std": 0.4277646541595459,
+      "step": 2304
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3232.0,
+      "completions/mean_length": 610.5535888671875,
+      "completions/mean_terminated_length": 547.1818237304688,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 2.378643280887284,
+      "grad_norm": 0.650519073009491,
+      "kl": 0.1356201171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0031,
+      "num_tokens": 270597035.0,
+      "reward": 1.6129463911056519,
+      "reward_std": 0.170953169465065,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6218749284744263,
+      "rewards/curriculum_aware_reward_fn/std": 0.42297643423080444,
+      "step": 2305
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1710.0,
+      "completions/mean_length": 560.6517944335938,
+      "completions/mean_terminated_length": 496.3727111816406,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 2.379675006448285,
+      "grad_norm": 4.208312511444092,
+      "kl": 0.671630859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0193,
+      "num_tokens": 270725215.0,
+      "reward": 1.4825893640518188,
+      "reward_std": 0.24740009009838104,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4915178120136261,
+      "rewards/curriculum_aware_reward_fn/std": 0.4709530472755432,
+      "step": 2306
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4092.0,
+      "completions/mean_length": 612.625,
+      "completions/mean_terminated_length": 549.2908935546875,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 2.3807067320092856,
+      "grad_norm": 0.5898601412773132,
+      "kl": 0.1246337890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0192,
+      "num_tokens": 270861036.0,
+      "reward": 1.501339316368103,
+      "reward_std": 0.14689765870571136,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5102678537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.4506821036338806,
+      "step": 2307
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2972.0,
+      "completions/max_terminated_length": 2972.0,
+      "completions/mean_length": 506.4375305175781,
+      "completions/mean_terminated_length": 506.4375305175781,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 2.3817384575702865,
+      "grad_norm": 0.7323086857795715,
+      "kl": 0.150634765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0484,
+      "num_tokens": 270982269.0,
+      "reward": 1.5071427822113037,
+      "reward_std": 0.18541446328163147,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5160714387893677,
+      "rewards/curriculum_aware_reward_fn/std": 0.4374997615814209,
+      "step": 2308
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1938.0,
+      "completions/max_terminated_length": 1938.0,
+      "completions/mean_length": 574.0357666015625,
+      "completions/mean_terminated_length": 574.0357666015625,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 2.382770183131287,
+      "grad_norm": 0.7194310426712036,
+      "kl": 0.1356201171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0082,
+      "num_tokens": 271117268.0,
+      "reward": 1.3991073369979858,
+      "reward_std": 0.21801282465457916,
+      "rewards/code_format_reward/mean": 0.9642857313156128,
+      "rewards/code_format_reward/std": 0.18641091883182526,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43482139706611633,
+      "rewards/curriculum_aware_reward_fn/std": 0.39540374279022217,
+      "step": 2309
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2293.0,
+      "completions/mean_length": 613.232177734375,
+      "completions/mean_terminated_length": 549.9090576171875,
+      "completions/min_length": 277.0,
+      "completions/min_terminated_length": 277.0,
+      "epoch": 2.3838019086922877,
+      "grad_norm": 0.7925401329994202,
+      "kl": 0.14013671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0518,
+      "num_tokens": 271259478.0,
+      "reward": 1.4066966772079468,
+      "reward_std": 0.3506569266319275,
+      "rewards/code_format_reward/mean": 0.8928571343421936,
+      "rewards/code_format_reward/std": 0.3106848895549774,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5138393044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.4164527356624603,
+      "step": 2310
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2534.0,
+      "completions/max_terminated_length": 2534.0,
+      "completions/mean_length": 562.2232666015625,
+      "completions/mean_terminated_length": 562.2232666015625,
+      "completions/min_length": 151.0,
+      "completions/min_terminated_length": 151.0,
+      "epoch": 2.3848336342532885,
+      "grad_norm": 0.7765570878982544,
+      "kl": 0.13818359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0486,
+      "num_tokens": 271388646.0,
+      "reward": 1.3718751668930054,
+      "reward_std": 0.38008591532707214,
+      "rewards/code_format_reward/mean": 0.8392857313156128,
+      "rewards/code_format_reward/std": 0.368917852640152,
+      "rewards/curriculum_aware_reward_fn/mean": 0.532589316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.44865715503692627,
+      "step": 2311
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2419.0,
+      "completions/mean_length": 662.4107666015625,
+      "completions/mean_terminated_length": 599.9818115234375,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 2.3858653598142894,
+      "grad_norm": 0.8191332221031189,
+      "kl": 0.136962890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0209,
+      "num_tokens": 271534292.0,
+      "reward": 1.2691963911056519,
+      "reward_std": 0.47810545563697815,
+      "rewards/code_format_reward/mean": 0.7678571343421936,
+      "rewards/code_format_reward/std": 0.4240972101688385,
+      "rewards/curriculum_aware_reward_fn/mean": 0.501339316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.4248189926147461,
+      "step": 2312
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1816.0,
+      "completions/max_terminated_length": 1816.0,
+      "completions/mean_length": 568.9017944335938,
+      "completions/mean_terminated_length": 568.9017944335938,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 2.38689708537529,
+      "grad_norm": 0.8380082845687866,
+      "kl": 0.137939453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0386,
+      "num_tokens": 271667708.0,
+      "reward": 1.2825894355773926,
+      "reward_std": 0.5124881863594055,
+      "rewards/code_format_reward/mean": 0.7857142686843872,
+      "rewards/code_format_reward/std": 0.41217005252838135,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49687501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3966231942176819,
+      "step": 2313
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1416.0,
+      "completions/mean_length": 542.6517944335938,
+      "completions/mean_terminated_length": 510.6396484375,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 2.387928810936291,
+      "grad_norm": 0.7812418937683105,
+      "kl": 0.1484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0299,
+      "num_tokens": 271790456.0,
+      "reward": 1.4799107313156128,
+      "reward_std": 0.3694676160812378,
+      "rewards/code_format_reward/mean": 0.9017857313156128,
+      "rewards/code_format_reward/std": 0.2989417314529419,
+      "rewards/curriculum_aware_reward_fn/mean": 0.578125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3827226459980011,
+      "step": 2314
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2043.0,
+      "completions/mean_length": 589.4732666015625,
+      "completions/mean_terminated_length": 557.8828735351562,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "epoch": 2.388960536497292,
+      "grad_norm": 0.7020928263664246,
+      "kl": 0.137939453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0453,
+      "num_tokens": 271924720.0,
+      "reward": 1.4084821939468384,
+      "reward_std": 0.2678705155849457,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4352678656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.4016653597354889,
+      "step": 2315
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1615.0,
+      "completions/mean_length": 577.5892944335938,
+      "completions/mean_terminated_length": 545.8919067382812,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 2.3899922620582927,
+      "grad_norm": 0.7192690968513489,
+      "kl": 0.149169921875,
+      "learning_rate": 1e-06,
+      "loss": 0.055,
+      "num_tokens": 272055378.0,
+      "reward": 1.466071605682373,
+      "reward_std": 0.21608500182628632,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48392853140830994,
+      "rewards/curriculum_aware_reward_fn/std": 0.43827131390571594,
+      "step": 2316
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2519.0,
+      "completions/max_terminated_length": 2519.0,
+      "completions/mean_length": 522.3482666015625,
+      "completions/mean_terminated_length": 522.3482666015625,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 2.391023987619293,
+      "grad_norm": 0.743002712726593,
+      "kl": 0.139404296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0163,
+      "num_tokens": 272180584.0,
+      "reward": 1.3870537281036377,
+      "reward_std": 0.18511782586574554,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40491071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.36223530769348145,
+      "step": 2317
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2154.0,
+      "completions/max_terminated_length": 2154.0,
+      "completions/mean_length": 482.33038330078125,
+      "completions/mean_terminated_length": 482.33038330078125,
+      "completions/min_length": 140.0,
+      "completions/min_terminated_length": 140.0,
+      "epoch": 2.392055713180294,
+      "grad_norm": 0.6114434599876404,
+      "kl": 0.136474609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0142,
+      "num_tokens": 272306159.0,
+      "reward": 1.5455358028411865,
+      "reward_std": 0.10450504720211029,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5633928179740906,
+      "rewards/curriculum_aware_reward_fn/std": 0.4080265760421753,
+      "step": 2318
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1303.0,
+      "completions/max_terminated_length": 1303.0,
+      "completions/mean_length": 536.3660888671875,
+      "completions/mean_terminated_length": 536.3660888671875,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 2.393087438741295,
+      "grad_norm": 0.7736281156539917,
+      "kl": 0.141845703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0054,
+      "num_tokens": 272438549.0,
+      "reward": 1.3486608266830444,
+      "reward_std": 0.26764869689941406,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3575893044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.3818052113056183,
+      "step": 2319
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1726.0,
+      "completions/max_terminated_length": 1726.0,
+      "completions/mean_length": 490.52679443359375,
+      "completions/mean_terminated_length": 490.52679443359375,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 2.3941191643022957,
+      "grad_norm": 0.6975762248039246,
+      "kl": 0.14013671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0034,
+      "num_tokens": 272556922.0,
+      "reward": 1.6674107313156128,
+      "reward_std": 0.24090106785297394,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6674107313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.4099390208721161,
+      "step": 2320
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1051.0,
+      "completions/max_terminated_length": 1051.0,
+      "completions/mean_length": 472.044677734375,
+      "completions/mean_terminated_length": 472.044677734375,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 2.3951508898632965,
+      "grad_norm": 0.7948324680328369,
+      "kl": 0.14599609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0365,
+      "num_tokens": 272678230.0,
+      "reward": 1.4892858266830444,
+      "reward_std": 0.24986693263053894,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4892857074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.46728214621543884,
+      "step": 2321
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1366.0,
+      "completions/max_terminated_length": 1366.0,
+      "completions/mean_length": 532.8392944335938,
+      "completions/mean_terminated_length": 532.8392944335938,
+      "completions/min_length": 267.0,
+      "completions/min_terminated_length": 267.0,
+      "epoch": 2.396182615424297,
+      "grad_norm": 0.7188726663589478,
+      "kl": 0.148681640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0278,
+      "num_tokens": 272813424.0,
+      "reward": 1.5388394594192505,
+      "reward_std": 0.20557691156864166,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5388392806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.38954293727874756,
+      "step": 2322
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1163.0,
+      "completions/max_terminated_length": 1163.0,
+      "completions/mean_length": 501.26788330078125,
+      "completions/mean_terminated_length": 501.26788330078125,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 2.3972143409852977,
+      "grad_norm": 0.8085273504257202,
+      "kl": 0.14697265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0012,
+      "num_tokens": 272934204.0,
+      "reward": 1.4508929252624512,
+      "reward_std": 0.18216951191425323,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4508928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.40246883034706116,
+      "step": 2323
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1537.0,
+      "completions/mean_length": 575.857177734375,
+      "completions/mean_terminated_length": 544.1441650390625,
+      "completions/min_length": 254.0,
+      "completions/min_terminated_length": 254.0,
+      "epoch": 2.3982460665462986,
+      "grad_norm": 0.8094402551651001,
+      "kl": 0.143310546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0251,
+      "num_tokens": 273070429.0,
+      "reward": 1.5200893878936768,
+      "reward_std": 0.19970080256462097,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5379464030265808,
+      "rewards/curriculum_aware_reward_fn/std": 0.3922242820262909,
+      "step": 2324
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2306.0,
+      "completions/max_terminated_length": 2306.0,
+      "completions/mean_length": 497.8660888671875,
+      "completions/mean_terminated_length": 497.8660888671875,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 2.3992777921072994,
+      "grad_norm": 0.6123159527778625,
+      "kl": 0.140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0331,
+      "num_tokens": 273186026.0,
+      "reward": 1.5879465341567993,
+      "reward_std": 0.1610545963048935,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5879464745521545,
+      "rewards/curriculum_aware_reward_fn/std": 0.4583187997341156,
+      "step": 2325
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1580.0,
+      "completions/max_terminated_length": 1580.0,
+      "completions/mean_length": 508.1607360839844,
+      "completions/mean_terminated_length": 508.1607360839844,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 2.4003095176683003,
+      "grad_norm": 0.6526829600334167,
+      "kl": 0.15283203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0024,
+      "num_tokens": 273312177.0,
+      "reward": 1.6111607551574707,
+      "reward_std": 0.21387022733688354,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6111606955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.48887211084365845,
+      "step": 2326
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1584.0,
+      "completions/max_terminated_length": 1584.0,
+      "completions/mean_length": 497.1964416503906,
+      "completions/mean_terminated_length": 497.1964416503906,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 2.401341243229301,
+      "grad_norm": 0.7623741030693054,
+      "kl": 0.14404296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0017,
+      "num_tokens": 273438612.0,
+      "reward": 1.440178632736206,
+      "reward_std": 0.14009803533554077,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44017860293388367,
+      "rewards/curriculum_aware_reward_fn/std": 0.4423951804637909,
+      "step": 2327
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1524.0,
+      "completions/max_terminated_length": 1524.0,
+      "completions/mean_length": 552.3392944335938,
+      "completions/mean_terminated_length": 552.3392944335938,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 2.402372968790302,
+      "grad_norm": 0.5391533374786377,
+      "kl": 0.1292724609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0072,
+      "num_tokens": 273565707.0,
+      "reward": 1.4424108266830444,
+      "reward_std": 0.14713065326213837,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4424107074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.4474255442619324,
+      "step": 2328
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 851.0,
+      "completions/max_terminated_length": 851.0,
+      "completions/mean_length": 467.5982360839844,
+      "completions/mean_terminated_length": 467.5982360839844,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 2.4034046943513028,
+      "grad_norm": 0.8083183169364929,
+      "kl": 0.15283203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0094,
+      "num_tokens": 273684543.0,
+      "reward": 1.4950894117355347,
+      "reward_std": 0.20851272344589233,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4950892925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.44377371668815613,
+      "step": 2329
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1367.0,
+      "completions/max_terminated_length": 1367.0,
+      "completions/mean_length": 527.875,
+      "completions/mean_terminated_length": 527.875,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "epoch": 2.404436419912303,
+      "grad_norm": 0.6930996775627136,
+      "kl": 0.137451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0285,
+      "num_tokens": 273816815.0,
+      "reward": 1.446874976158142,
+      "reward_std": 0.17862723767757416,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44687503576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.4424685835838318,
+      "step": 2330
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1438.0,
+      "completions/max_terminated_length": 1438.0,
+      "completions/mean_length": 544.3303833007812,
+      "completions/mean_terminated_length": 544.3303833007812,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 2.405468145473304,
+      "grad_norm": 0.7719082832336426,
+      "kl": 0.146728515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0278,
+      "num_tokens": 273940647.0,
+      "reward": 1.516964316368103,
+      "reward_std": 0.20712628960609436,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5258928537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.41340065002441406,
+      "step": 2331
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 968.0,
+      "completions/max_terminated_length": 968.0,
+      "completions/mean_length": 493.83929443359375,
+      "completions/mean_terminated_length": 493.83929443359375,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 2.406499871034305,
+      "grad_norm": 0.7389146685600281,
+      "kl": 0.150634765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0144,
+      "num_tokens": 274066468.0,
+      "reward": 1.3535715341567993,
+      "reward_std": 0.18684786558151245,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.4170088768005371,
+      "step": 2332
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1510.0,
+      "completions/max_terminated_length": 1510.0,
+      "completions/mean_length": 531.75,
+      "completions/mean_terminated_length": 531.75,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 2.4075315965953057,
+      "grad_norm": 0.7130151987075806,
+      "kl": 0.130859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0211,
+      "num_tokens": 274195058.0,
+      "reward": 1.3089287281036377,
+      "reward_std": 0.20015646517276764,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30892854928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.3642970621585846,
+      "step": 2333
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1272.0,
+      "completions/max_terminated_length": 1272.0,
+      "completions/mean_length": 506.4107360839844,
+      "completions/mean_terminated_length": 506.4107360839844,
+      "completions/min_length": 257.0,
+      "completions/min_terminated_length": 257.0,
+      "epoch": 2.4085633221563065,
+      "grad_norm": 0.6497871279716492,
+      "kl": 0.14111328125,
+      "learning_rate": 1e-06,
+      "loss": -0.016,
+      "num_tokens": 274317590.0,
+      "reward": 1.6272321939468384,
+      "reward_std": 0.1722278743982315,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6272321343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.4044175148010254,
+      "step": 2334
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1132.0,
+      "completions/max_terminated_length": 1132.0,
+      "completions/mean_length": 571.9285888671875,
+      "completions/mean_terminated_length": 571.9285888671875,
+      "completions/min_length": 283.0,
+      "completions/min_terminated_length": 283.0,
+      "epoch": 2.4095950477173074,
+      "grad_norm": 0.6405318379402161,
+      "kl": 0.1298828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0098,
+      "num_tokens": 274453326.0,
+      "reward": 1.2607142925262451,
+      "reward_std": 0.12552452087402344,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2607142925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.3606443703174591,
+      "step": 2335
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2659.0,
+      "completions/max_terminated_length": 2659.0,
+      "completions/mean_length": 522.8035888671875,
+      "completions/mean_terminated_length": 522.8035888671875,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "epoch": 2.4106267732783078,
+      "grad_norm": 0.7238879203796387,
+      "kl": 0.143310546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0115,
+      "num_tokens": 274580116.0,
+      "reward": 1.6142858266830444,
+      "reward_std": 0.26368361711502075,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6142857670783997,
+      "rewards/curriculum_aware_reward_fn/std": 0.4057263731956482,
+      "step": 2336
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1801.0,
+      "completions/max_terminated_length": 1801.0,
+      "completions/mean_length": 430.3035888671875,
+      "completions/mean_terminated_length": 430.3035888671875,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 2.4116584988393086,
+      "grad_norm": 0.5872229933738708,
+      "kl": 0.13525390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0043,
+      "num_tokens": 274694194.0,
+      "reward": 1.612053632736206,
+      "reward_std": 0.12714780867099762,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6120535731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.4642268419265747,
+      "step": 2337
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1089.0,
+      "completions/max_terminated_length": 1089.0,
+      "completions/mean_length": 426.6160888671875,
+      "completions/mean_terminated_length": 426.6160888671875,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 2.4126902244003094,
+      "grad_norm": 1.0934064388275146,
+      "kl": 0.226806640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0316,
+      "num_tokens": 274797296.0,
+      "reward": 1.6227680444717407,
+      "reward_std": 0.1348063200712204,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6227678656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.42695698142051697,
+      "step": 2338
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1102.0,
+      "completions/max_terminated_length": 1102.0,
+      "completions/mean_length": 474.107177734375,
+      "completions/mean_terminated_length": 474.107177734375,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 2.4137219499613103,
+      "grad_norm": 0.7120774984359741,
+      "kl": 0.14697265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0417,
+      "num_tokens": 274911786.0,
+      "reward": 1.5785715579986572,
+      "reward_std": 0.14783048629760742,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5785714387893677,
+      "rewards/curriculum_aware_reward_fn/std": 0.4151258170604706,
+      "step": 2339
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1504.0,
+      "completions/max_terminated_length": 1504.0,
+      "completions/mean_length": 450.5000305175781,
+      "completions/mean_terminated_length": 450.5000305175781,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 2.414753675522311,
+      "grad_norm": 0.8808677792549133,
+      "kl": 0.142333984375,
+      "learning_rate": 1e-06,
+      "loss": 0.045,
+      "num_tokens": 275023338.0,
+      "reward": 1.6892857551574707,
+      "reward_std": 0.17411798238754272,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6892856955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.39217737317085266,
+      "step": 2340
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1393.0,
+      "completions/max_terminated_length": 1393.0,
+      "completions/mean_length": 496.9107360839844,
+      "completions/mean_terminated_length": 496.9107360839844,
+      "completions/min_length": 274.0,
+      "completions/min_terminated_length": 274.0,
+      "epoch": 2.415785401083312,
+      "grad_norm": 0.7008398175239563,
+      "kl": 0.15185546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0185,
+      "num_tokens": 275146059.0,
+      "reward": 1.4919644594192505,
+      "reward_std": 0.2472720891237259,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5098214149475098,
+      "rewards/curriculum_aware_reward_fn/std": 0.3980152904987335,
+      "step": 2341
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 955.0,
+      "completions/max_terminated_length": 955.0,
+      "completions/mean_length": 497.482177734375,
+      "completions/mean_terminated_length": 497.482177734375,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 2.416817126644313,
+      "grad_norm": 0.6380481123924255,
+      "kl": 0.1229248046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0137,
+      "num_tokens": 275274962.0,
+      "reward": 1.4843751192092896,
+      "reward_std": 0.13236817717552185,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4933035373687744,
+      "rewards/curriculum_aware_reward_fn/std": 0.43782302737236023,
+      "step": 2342
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1041.0,
+      "completions/max_terminated_length": 1041.0,
+      "completions/mean_length": 472.9464416503906,
+      "completions/mean_terminated_length": 472.9464416503906,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 2.417848852205313,
+      "grad_norm": 0.7028351426124573,
+      "kl": 0.148193359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0007,
+      "num_tokens": 275400914.0,
+      "reward": 1.4901787042617798,
+      "reward_std": 0.21455268561840057,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49910715222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.43319380283355713,
+      "step": 2343
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1112.0,
+      "completions/max_terminated_length": 1112.0,
+      "completions/mean_length": 474.2857360839844,
+      "completions/mean_terminated_length": 474.2857360839844,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 2.418880577766314,
+      "grad_norm": 0.6846729516983032,
+      "kl": 0.14599609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0031,
+      "num_tokens": 275521360.0,
+      "reward": 1.5718752145767212,
+      "reward_std": 0.1859087347984314,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5718749761581421,
+      "rewards/curriculum_aware_reward_fn/std": 0.4332353174686432,
+      "step": 2344
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1161.0,
+      "completions/max_terminated_length": 1161.0,
+      "completions/mean_length": 459.6250305175781,
+      "completions/mean_terminated_length": 459.6250305175781,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 2.419912303327315,
+      "grad_norm": 0.8312680125236511,
+      "kl": 0.185791015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0093,
+      "num_tokens": 275645547.0,
+      "reward": 1.5767858028411865,
+      "reward_std": 0.18420013785362244,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.576785683631897,
+      "rewards/curriculum_aware_reward_fn/std": 0.5562069416046143,
+      "step": 2345
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 966.0,
+      "completions/max_terminated_length": 966.0,
+      "completions/mean_length": 474.4464416503906,
+      "completions/mean_terminated_length": 474.4464416503906,
+      "completions/min_length": 143.0,
+      "completions/min_terminated_length": 143.0,
+      "epoch": 2.4209440288883157,
+      "grad_norm": 0.8140915632247925,
+      "kl": 0.13916015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0049,
+      "num_tokens": 275772962.0,
+      "reward": 1.4232144355773926,
+      "reward_std": 0.2549702823162079,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43214288353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.40346792340278625,
+      "step": 2346
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1507.0,
+      "completions/max_terminated_length": 1507.0,
+      "completions/mean_length": 471.232177734375,
+      "completions/mean_terminated_length": 471.232177734375,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 2.4219757544493166,
+      "grad_norm": 0.6817914247512817,
+      "kl": 0.1494140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0055,
+      "num_tokens": 275900150.0,
+      "reward": 1.4977679252624512,
+      "reward_std": 0.1634839028120041,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4977678656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.40838003158569336,
+      "step": 2347
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1616.0,
+      "completions/max_terminated_length": 1616.0,
+      "completions/mean_length": 480.5535888671875,
+      "completions/mean_terminated_length": 480.5535888671875,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 2.4230074800103174,
+      "grad_norm": 0.6364054083824158,
+      "kl": 0.140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0013,
+      "num_tokens": 276021613.0,
+      "reward": 1.5656249523162842,
+      "reward_std": 0.11939293891191483,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.565625011920929,
+      "rewards/curriculum_aware_reward_fn/std": 0.43687212467193604,
+      "step": 2348
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 976.0,
+      "completions/max_terminated_length": 976.0,
+      "completions/mean_length": 446.9285888671875,
+      "completions/mean_terminated_length": 446.9285888671875,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 2.424039205571318,
+      "grad_norm": 0.9955868124961853,
+      "kl": 0.195556640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0341,
+      "num_tokens": 276141830.0,
+      "reward": 1.5102678537368774,
+      "reward_std": 0.16214899718761444,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5102678537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.41601601243019104,
+      "step": 2349
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1011.0,
+      "completions/max_terminated_length": 1011.0,
+      "completions/mean_length": 500.83929443359375,
+      "completions/mean_terminated_length": 500.83929443359375,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 2.4250709311323186,
+      "grad_norm": 0.7112178206443787,
+      "kl": 0.1337890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0036,
+      "num_tokens": 276268239.0,
+      "reward": 1.4861608743667603,
+      "reward_std": 0.22006341814994812,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4861607253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.44006600975990295,
+      "step": 2350
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1270.0,
+      "completions/max_terminated_length": 1270.0,
+      "completions/mean_length": 524.5892944335938,
+      "completions/mean_terminated_length": 524.5892944335938,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 2.4261026566933195,
+      "grad_norm": 0.7407634258270264,
+      "kl": 0.13525390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0056,
+      "num_tokens": 276390525.0,
+      "reward": 1.5602679252624512,
+      "reward_std": 0.28421685099601746,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5602678656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.48985832929611206,
+      "step": 2351
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1226.0,
+      "completions/max_terminated_length": 1226.0,
+      "completions/mean_length": 465.4375305175781,
+      "completions/mean_terminated_length": 465.4375305175781,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 2.4271343822543203,
+      "grad_norm": 0.6548599600791931,
+      "kl": 0.1337890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0255,
+      "num_tokens": 276516308.0,
+      "reward": 1.5254465341567993,
+      "reward_std": 0.2117275446653366,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5254464149475098,
+      "rewards/curriculum_aware_reward_fn/std": 0.452769011259079,
+      "step": 2352
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1437.0,
+      "completions/max_terminated_length": 1437.0,
+      "completions/mean_length": 563.3482666015625,
+      "completions/mean_terminated_length": 563.3482666015625,
+      "completions/min_length": 270.0,
+      "completions/min_terminated_length": 270.0,
+      "epoch": 2.428166107815321,
+      "grad_norm": 0.724729597568512,
+      "kl": 0.1287841796875,
+      "learning_rate": 1e-06,
+      "loss": 0.034,
+      "num_tokens": 276665258.0,
+      "reward": 1.4772323369979858,
+      "reward_std": 0.23840339481830597,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4772321581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.43495306372642517,
+      "step": 2353
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1079.0,
+      "completions/max_terminated_length": 1079.0,
+      "completions/mean_length": 487.169677734375,
+      "completions/mean_terminated_length": 487.169677734375,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 2.429197833376322,
+      "grad_norm": 0.7468309998512268,
+      "kl": 0.14453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0198,
+      "num_tokens": 276783444.0,
+      "reward": 1.4058037996292114,
+      "reward_std": 0.1380435973405838,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4058035910129547,
+      "rewards/curriculum_aware_reward_fn/std": 0.35853466391563416,
+      "step": 2354
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1146.0,
+      "completions/max_terminated_length": 1146.0,
+      "completions/mean_length": 523.4017944335938,
+      "completions/mean_terminated_length": 523.4017944335938,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 2.430229558937323,
+      "grad_norm": 35.98688507080078,
+      "kl": 0.153076171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0095,
+      "num_tokens": 276910938.0,
+      "reward": 1.502232313156128,
+      "reward_std": 0.25507819652557373,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5200892686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.45788684487342834,
+      "step": 2355
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 814.0,
+      "completions/max_terminated_length": 814.0,
+      "completions/mean_length": 433.4285888671875,
+      "completions/mean_terminated_length": 433.4285888671875,
+      "completions/min_length": 154.0,
+      "completions/min_terminated_length": 154.0,
+      "epoch": 2.4312612844983237,
+      "grad_norm": 0.7806984782218933,
+      "kl": 0.143310546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0096,
+      "num_tokens": 277024078.0,
+      "reward": 1.6334822177886963,
+      "reward_std": 0.18449801206588745,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6334820985794067,
+      "rewards/curriculum_aware_reward_fn/std": 0.42903560400009155,
+      "step": 2356
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1000.0,
+      "completions/max_terminated_length": 1000.0,
+      "completions/mean_length": 435.3482360839844,
+      "completions/mean_terminated_length": 435.3482360839844,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 2.432293010059324,
+      "grad_norm": 0.7690462470054626,
+      "kl": 0.187255859375,
+      "learning_rate": 1e-06,
+      "loss": 0.007,
+      "num_tokens": 277130160.0,
+      "reward": 1.557142972946167,
+      "reward_std": 0.16556496918201447,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5660714507102966,
+      "rewards/curriculum_aware_reward_fn/std": 0.43827131390571594,
+      "step": 2357
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1043.0,
+      "completions/max_terminated_length": 1043.0,
+      "completions/mean_length": 449.65179443359375,
+      "completions/mean_terminated_length": 449.65179443359375,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 2.433324735620325,
+      "grad_norm": 0.721041738986969,
+      "kl": 0.14208984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0062,
+      "num_tokens": 277247707.0,
+      "reward": 1.5343750715255737,
+      "reward_std": 0.22333721816539764,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5433036088943481,
+      "rewards/curriculum_aware_reward_fn/std": 0.5206326246261597,
+      "step": 2358
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1342.0,
+      "completions/max_terminated_length": 1342.0,
+      "completions/mean_length": 486.45538330078125,
+      "completions/mean_terminated_length": 486.45538330078125,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 2.4343564611813258,
+      "grad_norm": 0.7117599248886108,
+      "kl": 0.1356201171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0165,
+      "num_tokens": 277373856.0,
+      "reward": 1.5066964626312256,
+      "reward_std": 0.18413867056369781,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5066964030265808,
+      "rewards/curriculum_aware_reward_fn/std": 0.4238123595714569,
+      "step": 2359
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1196.0,
+      "completions/max_terminated_length": 1196.0,
+      "completions/mean_length": 497.33929443359375,
+      "completions/mean_terminated_length": 497.33929443359375,
+      "completions/min_length": 274.0,
+      "completions/min_terminated_length": 274.0,
+      "epoch": 2.4353881867423266,
+      "grad_norm": 0.7181190252304077,
+      "kl": 0.14306640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0088,
+      "num_tokens": 277501133.0,
+      "reward": 1.4875000715255737,
+      "reward_std": 0.22479689121246338,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49642857909202576,
+      "rewards/curriculum_aware_reward_fn/std": 0.47309190034866333,
+      "step": 2360
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1140.0,
+      "completions/max_terminated_length": 1140.0,
+      "completions/mean_length": 494.2589416503906,
+      "completions/mean_terminated_length": 494.2589416503906,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 2.4364199123033274,
+      "grad_norm": 0.6397473216056824,
+      "kl": 0.1258544921875,
+      "learning_rate": 1e-06,
+      "loss": 0.038,
+      "num_tokens": 277615290.0,
+      "reward": 1.5732144117355347,
+      "reward_std": 0.1664787083864212,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5732142329216003,
+      "rewards/curriculum_aware_reward_fn/std": 0.4036393165588379,
+      "step": 2361
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1095.0,
+      "completions/max_terminated_length": 1095.0,
+      "completions/mean_length": 514.8660888671875,
+      "completions/mean_terminated_length": 514.8660888671875,
+      "completions/min_length": 280.0,
+      "completions/min_terminated_length": 280.0,
+      "epoch": 2.4374516378643283,
+      "grad_norm": 0.6321114301681519,
+      "kl": 0.1474609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0108,
+      "num_tokens": 277741996.0,
+      "reward": 1.4080358743667603,
+      "reward_std": 0.15648870170116425,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41696426272392273,
+      "rewards/curriculum_aware_reward_fn/std": 0.41457420587539673,
+      "step": 2362
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2378.0,
+      "completions/max_terminated_length": 2378.0,
+      "completions/mean_length": 497.26788330078125,
+      "completions/mean_terminated_length": 497.26788330078125,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 2.4384833634253287,
+      "grad_norm": 0.6805658340454102,
+      "kl": 0.140869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0307,
+      "num_tokens": 277872051.0,
+      "reward": 1.441517949104309,
+      "reward_std": 0.14166250824928284,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45044639706611633,
+      "rewards/curriculum_aware_reward_fn/std": 0.46501457691192627,
+      "step": 2363
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1578.0,
+      "completions/mean_length": 484.419677734375,
+      "completions/mean_terminated_length": 451.8829040527344,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 2.4395150889863295,
+      "grad_norm": 0.6962588429450989,
+      "kl": 0.1448974609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0474,
+      "num_tokens": 277987206.0,
+      "reward": 1.579464316368103,
+      "reward_std": 0.22141891717910767,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5883928537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.4113799035549164,
+      "step": 2364
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1429.0,
+      "completions/max_terminated_length": 1429.0,
+      "completions/mean_length": 465.5982360839844,
+      "completions/mean_terminated_length": 465.5982360839844,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 2.4405468145473304,
+      "grad_norm": 0.6728730201721191,
+      "kl": 0.135009765625,
+      "learning_rate": 1e-06,
+      "loss": 0.015,
+      "num_tokens": 278115009.0,
+      "reward": 1.516964316368103,
+      "reward_std": 0.1518820971250534,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.516964316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.4160926043987274,
+      "step": 2365
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 987.0,
+      "completions/max_terminated_length": 987.0,
+      "completions/mean_length": 465.0089416503906,
+      "completions/mean_terminated_length": 465.0089416503906,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 2.441578540108331,
+      "grad_norm": 0.6624149084091187,
+      "kl": 0.144287109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0286,
+      "num_tokens": 278229061.0,
+      "reward": 1.6191965341567993,
+      "reward_std": 0.20879162847995758,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.628125011920929,
+      "rewards/curriculum_aware_reward_fn/std": 0.40519049763679504,
+      "step": 2366
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 954.0,
+      "completions/max_terminated_length": 954.0,
+      "completions/mean_length": 486.76788330078125,
+      "completions/mean_terminated_length": 486.76788330078125,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "epoch": 2.442610265669332,
+      "grad_norm": 0.8079473376274109,
+      "kl": 0.131591796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0174,
+      "num_tokens": 278355721.0,
+      "reward": 1.419196605682373,
+      "reward_std": 0.20018258690834045,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4281249940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.40424442291259766,
+      "step": 2367
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 923.0,
+      "completions/max_terminated_length": 923.0,
+      "completions/mean_length": 489.20538330078125,
+      "completions/mean_terminated_length": 489.20538330078125,
+      "completions/min_length": 264.0,
+      "completions/min_terminated_length": 264.0,
+      "epoch": 2.443641991230333,
+      "grad_norm": 0.7336907386779785,
+      "kl": 0.138671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0086,
+      "num_tokens": 278486189.0,
+      "reward": 1.458035945892334,
+      "reward_std": 0.1761673539876938,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4758928418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.4279630184173584,
+      "step": 2368
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1041.0,
+      "completions/max_terminated_length": 1041.0,
+      "completions/mean_length": 445.9375305175781,
+      "completions/mean_terminated_length": 445.9375305175781,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 2.4446737167913337,
+      "grad_norm": 0.582846462726593,
+      "kl": 0.1337890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0132,
+      "num_tokens": 278600639.0,
+      "reward": 1.51160728931427,
+      "reward_std": 0.11233289539813995,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5205357670783997,
+      "rewards/curriculum_aware_reward_fn/std": 0.475553423166275,
+      "step": 2369
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1211.0,
+      "completions/max_terminated_length": 1211.0,
+      "completions/mean_length": 478.08929443359375,
+      "completions/mean_terminated_length": 478.08929443359375,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 2.445705442352334,
+      "grad_norm": 0.7559561133384705,
+      "kl": 0.1390380859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0368,
+      "num_tokens": 278729491.0,
+      "reward": 1.567410945892334,
+      "reward_std": 0.19153565168380737,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5674106478691101,
+      "rewards/curriculum_aware_reward_fn/std": 0.3921360671520233,
+      "step": 2370
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1568.0,
+      "completions/max_terminated_length": 1568.0,
+      "completions/mean_length": 461.4285888671875,
+      "completions/mean_terminated_length": 461.4285888671875,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 2.446737167913335,
+      "grad_norm": 0.6146445274353027,
+      "kl": 0.141357421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0185,
+      "num_tokens": 278847293.0,
+      "reward": 1.6013394594192505,
+      "reward_std": 0.2132793515920639,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6191964149475098,
+      "rewards/curriculum_aware_reward_fn/std": 0.427555650472641,
+      "step": 2371
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1367.0,
+      "completions/max_terminated_length": 1367.0,
+      "completions/mean_length": 500.2500305175781,
+      "completions/mean_terminated_length": 500.2500305175781,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 2.447768893474336,
+      "grad_norm": 0.7753989100456238,
+      "kl": 0.135498046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0414,
+      "num_tokens": 278971613.0,
+      "reward": 1.417410969734192,
+      "reward_std": 0.2362671196460724,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4352678656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3869282007217407,
+      "step": 2372
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 819.0,
+      "completions/max_terminated_length": 819.0,
+      "completions/mean_length": 443.6875305175781,
+      "completions/mean_terminated_length": 443.6875305175781,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 2.4488006190353366,
+      "grad_norm": 0.715697169303894,
+      "kl": 0.137451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.002,
+      "num_tokens": 279092664.0,
+      "reward": 1.5709823369979858,
+      "reward_std": 0.1610252857208252,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5709820985794067,
+      "rewards/curriculum_aware_reward_fn/std": 0.43103888630867004,
+      "step": 2373
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 931.0,
+      "completions/max_terminated_length": 931.0,
+      "completions/mean_length": 417.5000305175781,
+      "completions/mean_terminated_length": 417.5000305175781,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 2.4498323445963375,
+      "grad_norm": 0.753842294216156,
+      "kl": 0.13525390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0014,
+      "num_tokens": 279208038.0,
+      "reward": 1.3986607789993286,
+      "reward_std": 0.10415496677160263,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39866071939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.410199910402298,
+      "step": 2374
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1408.0,
+      "completions/max_terminated_length": 1408.0,
+      "completions/mean_length": 425.1339416503906,
+      "completions/mean_terminated_length": 425.1339416503906,
+      "completions/min_length": 107.0,
+      "completions/min_terminated_length": 107.0,
+      "epoch": 2.4508640701573383,
+      "grad_norm": 0.8698515892028809,
+      "kl": 0.14306640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0085,
+      "num_tokens": 279316308.0,
+      "reward": 1.5468751192092896,
+      "reward_std": 0.24297165870666504,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5558035969734192,
+      "rewards/curriculum_aware_reward_fn/std": 0.42509910464286804,
+      "step": 2375
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 904.0,
+      "completions/max_terminated_length": 904.0,
+      "completions/mean_length": 442.8750305175781,
+      "completions/mean_terminated_length": 442.8750305175781,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 2.4518957957183387,
+      "grad_norm": 0.8830894231796265,
+      "kl": 0.142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0269,
+      "num_tokens": 279423877.0,
+      "reward": 1.4602681398391724,
+      "reward_std": 0.24288131296634674,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4691964089870453,
+      "rewards/curriculum_aware_reward_fn/std": 0.4479304552078247,
+      "step": 2376
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1137.0,
+      "completions/max_terminated_length": 1137.0,
+      "completions/mean_length": 447.3482360839844,
+      "completions/mean_terminated_length": 447.3482360839844,
+      "completions/min_length": 258.0,
+      "completions/min_terminated_length": 258.0,
+      "epoch": 2.4529275212793396,
+      "grad_norm": 0.8079304099082947,
+      "kl": 0.155029296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0289,
+      "num_tokens": 279536189.0,
+      "reward": 1.4299107789993286,
+      "reward_std": 0.21381406486034393,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43883928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.38035720586776733,
+      "step": 2377
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1028.0,
+      "completions/max_terminated_length": 1028.0,
+      "completions/mean_length": 421.3571472167969,
+      "completions/mean_terminated_length": 421.3571472167969,
+      "completions/min_length": 231.0,
+      "completions/min_terminated_length": 231.0,
+      "epoch": 2.4539592468403404,
+      "grad_norm": 0.7308657169342041,
+      "kl": 0.14208984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0007,
+      "num_tokens": 279644204.0,
+      "reward": 1.6513394117355347,
+      "reward_std": 0.11506469547748566,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6513392329216003,
+      "rewards/curriculum_aware_reward_fn/std": 0.4083288013935089,
+      "step": 2378
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1154.0,
+      "completions/max_terminated_length": 1154.0,
+      "completions/mean_length": 445.1339416503906,
+      "completions/mean_terminated_length": 445.1339416503906,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 2.4549909724013412,
+      "grad_norm": 0.7689999938011169,
+      "kl": 0.13427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0103,
+      "num_tokens": 279760293.0,
+      "reward": 1.5821430683135986,
+      "reward_std": 0.2600085437297821,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5821428298950195,
+      "rewards/curriculum_aware_reward_fn/std": 0.447659432888031,
+      "step": 2379
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 788.0,
+      "completions/max_terminated_length": 788.0,
+      "completions/mean_length": 417.7589416503906,
+      "completions/mean_terminated_length": 417.7589416503906,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 2.456022697962342,
+      "grad_norm": 0.8615774512290955,
+      "kl": 0.135009765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0033,
+      "num_tokens": 279873498.0,
+      "reward": 1.5763394832611084,
+      "reward_std": 0.3193800449371338,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5763392448425293,
+      "rewards/curriculum_aware_reward_fn/std": 0.41388946771621704,
+      "step": 2380
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1015.0,
+      "completions/max_terminated_length": 1015.0,
+      "completions/mean_length": 427.1696472167969,
+      "completions/mean_terminated_length": 427.1696472167969,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 2.457054423523343,
+      "grad_norm": 0.7411530017852783,
+      "kl": 0.1309814453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0187,
+      "num_tokens": 279988275.0,
+      "reward": 1.589732050895691,
+      "reward_std": 0.26775115728378296,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5897321701049805,
+      "rewards/curriculum_aware_reward_fn/std": 0.44418859481811523,
+      "step": 2381
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 685.0,
+      "completions/max_terminated_length": 685.0,
+      "completions/mean_length": 375.2321472167969,
+      "completions/mean_terminated_length": 375.2321472167969,
+      "completions/min_length": 148.0,
+      "completions/min_terminated_length": 148.0,
+      "epoch": 2.4580861490843438,
+      "grad_norm": 1.1352235078811646,
+      "kl": 0.1357421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0148,
+      "num_tokens": 280086980.0,
+      "reward": 1.5687501430511475,
+      "reward_std": 0.12106120586395264,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5776785612106323,
+      "rewards/curriculum_aware_reward_fn/std": 0.44331061840057373,
+      "step": 2382
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1090.0,
+      "completions/max_terminated_length": 1090.0,
+      "completions/mean_length": 432.3035888671875,
+      "completions/mean_terminated_length": 432.3035888671875,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "epoch": 2.459117874645344,
+      "grad_norm": 0.8991947174072266,
+      "kl": 0.1484375,
+      "learning_rate": 1e-06,
+      "loss": -0.002,
+      "num_tokens": 280205440.0,
+      "reward": 1.6343752145767212,
+      "reward_std": 0.2517082095146179,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6343750357627869,
+      "rewards/curriculum_aware_reward_fn/std": 0.46075883507728577,
+      "step": 2383
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 944.0,
+      "completions/max_terminated_length": 944.0,
+      "completions/mean_length": 425.6339416503906,
+      "completions/mean_terminated_length": 425.6339416503906,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 2.460149600206345,
+      "grad_norm": 0.8231422901153564,
+      "kl": 0.145751953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0094,
+      "num_tokens": 280310224.0,
+      "reward": 1.403571605682373,
+      "reward_std": 0.18607930839061737,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4035714268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.4063524007797241,
+      "step": 2384
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1028.0,
+      "completions/max_terminated_length": 1028.0,
+      "completions/mean_length": 449.0535888671875,
+      "completions/mean_terminated_length": 449.0535888671875,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 2.461181325767346,
+      "grad_norm": 0.6670386791229248,
+      "kl": 0.1282958984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0036,
+      "num_tokens": 280424904.0,
+      "reward": 1.4799107313156128,
+      "reward_std": 0.14945851266384125,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4799107015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.454529732465744,
+      "step": 2385
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 864.0,
+      "completions/max_terminated_length": 864.0,
+      "completions/mean_length": 454.0714416503906,
+      "completions/mean_terminated_length": 454.0714416503906,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 2.4622130513283467,
+      "grad_norm": 0.8334097862243652,
+      "kl": 0.130126953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0021,
+      "num_tokens": 280545515.0,
+      "reward": 1.5950894355773926,
+      "reward_std": 0.2628384232521057,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.595089316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.3990393877029419,
+      "step": 2386
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 890.0,
+      "completions/max_terminated_length": 890.0,
+      "completions/mean_length": 413.95538330078125,
+      "completions/mean_terminated_length": 413.95538330078125,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 2.4632447768893475,
+      "grad_norm": 0.7344523668289185,
+      "kl": 0.14404296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0226,
+      "num_tokens": 280668934.0,
+      "reward": 1.611607313156128,
+      "reward_std": 0.16572576761245728,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6205357313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.3946462571620941,
+      "step": 2387
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1023.0,
+      "completions/max_terminated_length": 1023.0,
+      "completions/mean_length": 449.8839416503906,
+      "completions/mean_terminated_length": 449.8839416503906,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 2.4642765024503483,
+      "grad_norm": 0.7916790246963501,
+      "kl": 0.13818359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0004,
+      "num_tokens": 280786059.0,
+      "reward": 1.4486607313156128,
+      "reward_std": 0.20076420903205872,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4486607015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.4619513750076294,
+      "step": 2388
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 889.0,
+      "completions/max_terminated_length": 889.0,
+      "completions/mean_length": 442.3750305175781,
+      "completions/mean_terminated_length": 442.3750305175781,
+      "completions/min_length": 132.0,
+      "completions/min_terminated_length": 132.0,
+      "epoch": 2.4653082280113487,
+      "grad_norm": 0.8291255831718445,
+      "kl": 0.14404296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0127,
+      "num_tokens": 280893929.0,
+      "reward": 1.583482265472412,
+      "reward_std": 0.2577275037765503,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.592410683631897,
+      "rewards/curriculum_aware_reward_fn/std": 0.41078969836235046,
+      "step": 2389
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 847.0,
+      "completions/max_terminated_length": 847.0,
+      "completions/mean_length": 421.89288330078125,
+      "completions/mean_terminated_length": 421.89288330078125,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 2.4663399535723496,
+      "grad_norm": 1.1254520416259766,
+      "kl": 0.280517578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0058,
+      "num_tokens": 281002788.0,
+      "reward": 1.4928573369979858,
+      "reward_std": 0.20928914844989777,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4928571879863739,
+      "rewards/curriculum_aware_reward_fn/std": 0.41230276226997375,
+      "step": 2390
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 888.0,
+      "completions/max_terminated_length": 888.0,
+      "completions/mean_length": 432.3035888671875,
+      "completions/mean_terminated_length": 432.3035888671875,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 2.4673716791333504,
+      "grad_norm": 0.7715088129043579,
+      "kl": 0.136962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.016,
+      "num_tokens": 281116936.0,
+      "reward": 1.5491071939468384,
+      "reward_std": 0.2484361082315445,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5491071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.42170727252960205,
+      "step": 2391
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1027.0,
+      "completions/max_terminated_length": 1027.0,
+      "completions/mean_length": 440.95538330078125,
+      "completions/mean_terminated_length": 440.95538330078125,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 2.4684034046943513,
+      "grad_norm": 0.8398274779319763,
+      "kl": 0.14990234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0029,
+      "num_tokens": 281242924.0,
+      "reward": 1.518303632736206,
+      "reward_std": 0.2738878130912781,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5272321701049805,
+      "rewards/curriculum_aware_reward_fn/std": 0.4173537790775299,
+      "step": 2392
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1224.0,
+      "completions/max_terminated_length": 1224.0,
+      "completions/mean_length": 455.232177734375,
+      "completions/mean_terminated_length": 455.232177734375,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 2.469435130255352,
+      "grad_norm": 0.9072141647338867,
+      "kl": 0.1353759765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0109,
+      "num_tokens": 281359521.0,
+      "reward": 1.5566965341567993,
+      "reward_std": 0.32403483986854553,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5745536088943481,
+      "rewards/curriculum_aware_reward_fn/std": 0.433090478181839,
+      "step": 2393
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1093.0,
+      "completions/max_terminated_length": 1093.0,
+      "completions/mean_length": 489.46429443359375,
+      "completions/mean_terminated_length": 489.46429443359375,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 2.470466855816353,
+      "grad_norm": 0.7404239177703857,
+      "kl": 0.125732421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0239,
+      "num_tokens": 281496549.0,
+      "reward": 1.5584824085235596,
+      "reward_std": 0.21812358498573303,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5584821105003357,
+      "rewards/curriculum_aware_reward_fn/std": 0.44209206104278564,
+      "step": 2394
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1078.0,
+      "completions/max_terminated_length": 1078.0,
+      "completions/mean_length": 431.4107360839844,
+      "completions/mean_terminated_length": 431.4107360839844,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 2.471498581377354,
+      "grad_norm": 0.7665273547172546,
+      "kl": 0.13427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0041,
+      "num_tokens": 281613684.0,
+      "reward": 1.3875001668930054,
+      "reward_std": 0.22730781137943268,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38750001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.4318147301673889,
+      "step": 2395
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 835.0,
+      "completions/max_terminated_length": 835.0,
+      "completions/mean_length": 445.9285888671875,
+      "completions/mean_terminated_length": 445.9285888671875,
+      "completions/min_length": 152.0,
+      "completions/min_terminated_length": 152.0,
+      "epoch": 2.4725303069383546,
+      "grad_norm": 0.8466961979866028,
+      "kl": 0.140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0086,
+      "num_tokens": 281728119.0,
+      "reward": 1.653571605682373,
+      "reward_std": 0.248878613114357,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6714285612106323,
+      "rewards/curriculum_aware_reward_fn/std": 0.6378000974655151,
+      "step": 2396
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 897.0,
+      "completions/max_terminated_length": 897.0,
+      "completions/mean_length": 439.6339416503906,
+      "completions/mean_terminated_length": 439.6339416503906,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 2.473562032499355,
+      "grad_norm": 0.7907005548477173,
+      "kl": 0.1319580078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0136,
+      "num_tokens": 281841170.0,
+      "reward": 1.4718750715255737,
+      "reward_std": 0.2342146635055542,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48080354928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.4226815402507782,
+      "step": 2397
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 978.0,
+      "completions/max_terminated_length": 978.0,
+      "completions/mean_length": 465.857177734375,
+      "completions/mean_terminated_length": 465.857177734375,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "epoch": 2.474593758060356,
+      "grad_norm": 0.8809310793876648,
+      "kl": 0.13330078125,
+      "learning_rate": 1e-06,
+      "loss": -0.019,
+      "num_tokens": 281958124.0,
+      "reward": 1.5223214626312256,
+      "reward_std": 0.2313743233680725,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.53125,
+      "rewards/curriculum_aware_reward_fn/std": 0.4156360924243927,
+      "step": 2398
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 910.0,
+      "completions/max_terminated_length": 910.0,
+      "completions/mean_length": 432.08929443359375,
+      "completions/mean_terminated_length": 432.08929443359375,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 2.4756254836213567,
+      "grad_norm": 0.7881631851196289,
+      "kl": 0.13671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0186,
+      "num_tokens": 282073634.0,
+      "reward": 1.4223215579986572,
+      "reward_std": 0.14511245489120483,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4312500059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.4221419394016266,
+      "step": 2399
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 999.0,
+      "completions/max_terminated_length": 999.0,
+      "completions/mean_length": 484.9732360839844,
+      "completions/mean_terminated_length": 484.9732360839844,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 2.4766572091823575,
+      "grad_norm": 0.6421279907226562,
+      "kl": 0.123291015625,
+      "learning_rate": 1e-06,
+      "loss": 0.011,
+      "num_tokens": 282194528.0,
+      "reward": 1.438839316368103,
+      "reward_std": 0.1716233789920807,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45669645071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.42661961913108826,
+      "step": 2400
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1000.0,
+      "completions/max_terminated_length": 1000.0,
+      "completions/mean_length": 426.3035888671875,
+      "completions/mean_terminated_length": 426.3035888671875,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 2.4776889347433584,
+      "grad_norm": 0.9702169299125671,
+      "kl": 0.143310546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0101,
+      "num_tokens": 282310111.0,
+      "reward": 1.5482145547866821,
+      "reward_std": 0.2584778964519501,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5660713911056519,
+      "rewards/curriculum_aware_reward_fn/std": 0.42358478903770447,
+      "step": 2401
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1197.0,
+      "completions/max_terminated_length": 1197.0,
+      "completions/mean_length": 476.4107360839844,
+      "completions/mean_terminated_length": 476.4107360839844,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 2.4787206603043592,
+      "grad_norm": 0.7732918858528137,
+      "kl": 0.127197265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0136,
+      "num_tokens": 282427785.0,
+      "reward": 1.4455358982086182,
+      "reward_std": 0.1627698689699173,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44553571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.43653804063796997,
+      "step": 2402
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 895.0,
+      "completions/max_terminated_length": 895.0,
+      "completions/mean_length": 429.01788330078125,
+      "completions/mean_terminated_length": 429.01788330078125,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 2.4797523858653596,
+      "grad_norm": 0.8249169588088989,
+      "kl": 0.17626953125,
+      "learning_rate": 1e-06,
+      "loss": -0.001,
+      "num_tokens": 282535895.0,
+      "reward": 1.6513392925262451,
+      "reward_std": 0.1300283670425415,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6513392329216003,
+      "rewards/curriculum_aware_reward_fn/std": 0.5273364782333374,
+      "step": 2403
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1156.0,
+      "completions/max_terminated_length": 1156.0,
+      "completions/mean_length": 444.544677734375,
+      "completions/mean_terminated_length": 444.544677734375,
+      "completions/min_length": 166.0,
+      "completions/min_terminated_length": 166.0,
+      "epoch": 2.4807841114263605,
+      "grad_norm": 0.9494808912277222,
+      "kl": 0.152099609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0127,
+      "num_tokens": 282659443.0,
+      "reward": 1.4933037757873535,
+      "reward_std": 0.28490185737609863,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4933035671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.3912612795829773,
+      "step": 2404
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1589.0,
+      "completions/max_terminated_length": 1589.0,
+      "completions/mean_length": 430.3571472167969,
+      "completions/mean_terminated_length": 430.3571472167969,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 2.4818158369873613,
+      "grad_norm": 0.8510450720787048,
+      "kl": 0.13623046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0105,
+      "num_tokens": 282770594.0,
+      "reward": 1.5924108028411865,
+      "reward_std": 0.29261523485183716,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6013392806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.4421721398830414,
+      "step": 2405
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1464.0,
+      "completions/max_terminated_length": 1464.0,
+      "completions/mean_length": 467.9910888671875,
+      "completions/mean_terminated_length": 467.9910888671875,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 2.482847562548362,
+      "grad_norm": 0.6033523082733154,
+      "kl": 0.1334228515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0172,
+      "num_tokens": 282887346.0,
+      "reward": 1.3660714626312256,
+      "reward_std": 0.18522292375564575,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3660714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.5164507627487183,
+      "step": 2406
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 784.0,
+      "completions/max_terminated_length": 784.0,
+      "completions/mean_length": 450.232177734375,
+      "completions/mean_terminated_length": 450.232177734375,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 2.483879288109363,
+      "grad_norm": 0.789675235748291,
+      "kl": 0.138916015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0153,
+      "num_tokens": 283001570.0,
+      "reward": 1.443750023841858,
+      "reward_std": 0.2680908441543579,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4437499940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.43924200534820557,
+      "step": 2407
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1272.0,
+      "completions/max_terminated_length": 1272.0,
+      "completions/mean_length": 467.9107360839844,
+      "completions/mean_terminated_length": 467.9107360839844,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "epoch": 2.484911013670364,
+      "grad_norm": 0.7835040092468262,
+      "kl": 0.138427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0191,
+      "num_tokens": 283118418.0,
+      "reward": 1.403571605682373,
+      "reward_std": 0.20980599522590637,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41249996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.38651248812675476,
+      "step": 2408
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 935.0,
+      "completions/max_terminated_length": 935.0,
+      "completions/mean_length": 451.2500305175781,
+      "completions/mean_terminated_length": 451.2500305175781,
+      "completions/min_length": 150.0,
+      "completions/min_terminated_length": 150.0,
+      "epoch": 2.4859427392313647,
+      "grad_norm": 0.8104711174964905,
+      "kl": 0.1455078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0098,
+      "num_tokens": 283243452.0,
+      "reward": 1.5383929014205933,
+      "reward_std": 0.20198899507522583,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5383928418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.4044227600097656,
+      "step": 2409
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1491.0,
+      "completions/max_terminated_length": 1491.0,
+      "completions/mean_length": 418.6964416503906,
+      "completions/mean_terminated_length": 418.6964416503906,
+      "completions/min_length": 143.0,
+      "completions/min_terminated_length": 143.0,
+      "epoch": 2.486974464792365,
+      "grad_norm": 0.808778703212738,
+      "kl": 0.152099609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0456,
+      "num_tokens": 283359730.0,
+      "reward": 1.5857144594192505,
+      "reward_std": 0.2007414996623993,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5857142806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.44440609216690063,
+      "step": 2410
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1139.0,
+      "completions/max_terminated_length": 1139.0,
+      "completions/mean_length": 427.5089416503906,
+      "completions/mean_terminated_length": 427.5089416503906,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 2.488006190353366,
+      "grad_norm": 0.7132419347763062,
+      "kl": 0.1376953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0038,
+      "num_tokens": 283475462.0,
+      "reward": 1.547767996788025,
+      "reward_std": 0.1488623172044754,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5477678179740906,
+      "rewards/curriculum_aware_reward_fn/std": 0.4024356007575989,
+      "step": 2411
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1545.0,
+      "completions/max_terminated_length": 1545.0,
+      "completions/mean_length": 437.08038330078125,
+      "completions/mean_terminated_length": 437.08038330078125,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 2.4890379159143667,
+      "grad_norm": 0.8942745327949524,
+      "kl": 0.151611328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0032,
+      "num_tokens": 283592777.0,
+      "reward": 1.534821629524231,
+      "reward_std": 0.21150675415992737,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.543749988079071,
+      "rewards/curriculum_aware_reward_fn/std": 0.4239520728588104,
+      "step": 2412
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 995.0,
+      "completions/max_terminated_length": 995.0,
+      "completions/mean_length": 461.1964416503906,
+      "completions/mean_terminated_length": 461.1964416503906,
+      "completions/min_length": 243.0,
+      "completions/min_terminated_length": 243.0,
+      "epoch": 2.4900696414753676,
+      "grad_norm": 0.8201280832290649,
+      "kl": 0.1416015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0483,
+      "num_tokens": 283717882.0,
+      "reward": 1.479017972946167,
+      "reward_std": 0.2597033679485321,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49687501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.3952009975910187,
+      "step": 2413
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 898.0,
+      "completions/max_terminated_length": 898.0,
+      "completions/mean_length": 419.0535888671875,
+      "completions/mean_terminated_length": 419.0535888671875,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 2.4911013670363684,
+      "grad_norm": 0.7921744585037231,
+      "kl": 0.14111328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0043,
+      "num_tokens": 283833731.0,
+      "reward": 1.5223214626312256,
+      "reward_std": 0.21495002508163452,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.53125,
+      "rewards/curriculum_aware_reward_fn/std": 0.43262892961502075,
+      "step": 2414
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1020.0,
+      "completions/max_terminated_length": 1020.0,
+      "completions/mean_length": 458.232177734375,
+      "completions/mean_terminated_length": 458.232177734375,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 2.4921330925973693,
+      "grad_norm": 0.921675443649292,
+      "kl": 0.17626953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0137,
+      "num_tokens": 283951714.0,
+      "reward": 1.4861608743667603,
+      "reward_std": 0.32043954730033875,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5129464268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.41713014245033264,
+      "step": 2415
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1056.0,
+      "completions/max_terminated_length": 1056.0,
+      "completions/mean_length": 422.0714416503906,
+      "completions/mean_terminated_length": 422.0714416503906,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "epoch": 2.4931648181583697,
+      "grad_norm": 0.8260616660118103,
+      "kl": 0.156982421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0261,
+      "num_tokens": 284070827.0,
+      "reward": 1.5308037996292114,
+      "reward_std": 0.16464564204216003,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5308035612106323,
+      "rewards/curriculum_aware_reward_fn/std": 0.3940003514289856,
+      "step": 2416
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2222.0,
+      "completions/max_terminated_length": 2222.0,
+      "completions/mean_length": 461.3660888671875,
+      "completions/mean_terminated_length": 461.3660888671875,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 2.4941965437193705,
+      "grad_norm": 0.6988282203674316,
+      "kl": 0.14208984375,
+      "learning_rate": 1e-06,
+      "loss": 0.016,
+      "num_tokens": 284192528.0,
+      "reward": 1.4892858266830444,
+      "reward_std": 0.21458344161510468,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4982142746448517,
+      "rewards/curriculum_aware_reward_fn/std": 0.42223817110061646,
+      "step": 2417
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 992.0,
+      "completions/max_terminated_length": 992.0,
+      "completions/mean_length": 429.95538330078125,
+      "completions/mean_terminated_length": 429.95538330078125,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 2.4952282692803713,
+      "grad_norm": 0.8364533185958862,
+      "kl": 0.149169921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0109,
+      "num_tokens": 284310942.0,
+      "reward": 1.4254463911056519,
+      "reward_std": 0.2122143805027008,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43437501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.37296178936958313,
+      "step": 2418
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1056.0,
+      "completions/max_terminated_length": 1056.0,
+      "completions/mean_length": 425.7321472167969,
+      "completions/mean_terminated_length": 425.7321472167969,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 2.496259994841372,
+      "grad_norm": 0.8200624585151672,
+      "kl": 0.158447265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0135,
+      "num_tokens": 284422436.0,
+      "reward": 1.5343750715255737,
+      "reward_std": 0.2781384289264679,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5433035492897034,
+      "rewards/curriculum_aware_reward_fn/std": 0.49408644437789917,
+      "step": 2419
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1269.0,
+      "completions/max_terminated_length": 1269.0,
+      "completions/mean_length": 376.5982360839844,
+      "completions/mean_terminated_length": 376.5982360839844,
+      "completions/min_length": 137.0,
+      "completions/min_terminated_length": 137.0,
+      "epoch": 2.497291720402373,
+      "grad_norm": 0.8670206665992737,
+      "kl": 0.1552734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0113,
+      "num_tokens": 284521266.0,
+      "reward": 1.6232143640518188,
+      "reward_std": 0.2606101334095001,
+      "rewards/code_format_reward/mean": 0.9642857313156128,
+      "rewards/code_format_reward/std": 0.18641091883182526,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6589285731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.40859097242355347,
+      "step": 2420
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 907.0,
+      "completions/max_terminated_length": 907.0,
+      "completions/mean_length": 427.2321472167969,
+      "completions/mean_terminated_length": 427.2321472167969,
+      "completions/min_length": 131.0,
+      "completions/min_terminated_length": 131.0,
+      "epoch": 2.498323445963374,
+      "grad_norm": 0.7723448276519775,
+      "kl": 0.130859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0047,
+      "num_tokens": 284639788.0,
+      "reward": 1.6361607313156128,
+      "reward_std": 0.214374378323555,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6361607313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.3943982720375061,
+      "step": 2421
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1003.0,
+      "completions/max_terminated_length": 1003.0,
+      "completions/mean_length": 492.1607360839844,
+      "completions/mean_terminated_length": 492.1607360839844,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 2.4993551715243747,
+      "grad_norm": 0.7836470007896423,
+      "kl": 0.144287109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0216,
+      "num_tokens": 284772645.0,
+      "reward": 1.3093751668930054,
+      "reward_std": 0.20374557375907898,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31830358505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.4010681211948395,
+      "step": 2422
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 805.0,
+      "completions/max_terminated_length": 805.0,
+      "completions/mean_length": 397.4375305175781,
+      "completions/mean_terminated_length": 397.4375305175781,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 2.5003868970853755,
+      "grad_norm": 0.9490916728973389,
+      "kl": 0.1904296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0072,
+      "num_tokens": 284877295.0,
+      "reward": 1.6299108266830444,
+      "reward_std": 0.20785073935985565,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6299107670783997,
+      "rewards/curriculum_aware_reward_fn/std": 0.39589446783065796,
+      "step": 2423
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1001.0,
+      "completions/max_terminated_length": 1001.0,
+      "completions/mean_length": 440.0089416503906,
+      "completions/mean_terminated_length": 440.0089416503906,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 2.501418622646376,
+      "grad_norm": 0.8236626982688904,
+      "kl": 0.145751953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0189,
+      "num_tokens": 284993398.0,
+      "reward": 1.4593751430511475,
+      "reward_std": 0.22319842875003815,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4593749940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.41814321279525757,
+      "step": 2424
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 826.0,
+      "completions/max_terminated_length": 826.0,
+      "completions/mean_length": 398.21429443359375,
+      "completions/mean_terminated_length": 398.21429443359375,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 2.5024503482073768,
+      "grad_norm": 0.7493698596954346,
+      "kl": 0.166259765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0111,
+      "num_tokens": 285108005.0,
+      "reward": 1.4629465341567993,
+      "reward_std": 0.14567653834819794,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47187498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.42879369854927063,
+      "step": 2425
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1229.0,
+      "completions/max_terminated_length": 1229.0,
+      "completions/mean_length": 446.044677734375,
+      "completions/mean_terminated_length": 446.044677734375,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 2.5034820737683776,
+      "grad_norm": 0.6787101030349731,
+      "kl": 0.140869140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0293,
+      "num_tokens": 285226982.0,
+      "reward": 1.5388394594192505,
+      "reward_std": 0.16813233494758606,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5388392806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.3883848488330841,
+      "step": 2426
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 845.0,
+      "completions/max_terminated_length": 845.0,
+      "completions/mean_length": 415.8660888671875,
+      "completions/mean_terminated_length": 415.8660888671875,
+      "completions/min_length": 157.0,
+      "completions/min_terminated_length": 157.0,
+      "epoch": 2.5045137993293785,
+      "grad_norm": 0.7325301170349121,
+      "kl": 0.13818359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0158,
+      "num_tokens": 285345491.0,
+      "reward": 1.6705358028411865,
+      "reward_std": 0.1813599020242691,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6794642806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.37534090876579285,
+      "step": 2427
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 823.0,
+      "completions/max_terminated_length": 823.0,
+      "completions/mean_length": 425.9285888671875,
+      "completions/mean_terminated_length": 425.9285888671875,
+      "completions/min_length": 137.0,
+      "completions/min_terminated_length": 137.0,
+      "epoch": 2.5055455248903793,
+      "grad_norm": 0.8319756388664246,
+      "kl": 0.15673828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0032,
+      "num_tokens": 285452198.0,
+      "reward": 1.4526787996292114,
+      "reward_std": 0.22211024165153503,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45267853140830994,
+      "rewards/curriculum_aware_reward_fn/std": 0.4158373177051544,
+      "step": 2428
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1047.0,
+      "completions/max_terminated_length": 1047.0,
+      "completions/mean_length": 481.1875305175781,
+      "completions/mean_terminated_length": 481.1875305175781,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 2.5065772504513797,
+      "grad_norm": 0.8352397680282593,
+      "kl": 0.150146484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0223,
+      "num_tokens": 285573018.0,
+      "reward": 1.540178656578064,
+      "reward_std": 0.23180150985717773,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5401785969734192,
+      "rewards/curriculum_aware_reward_fn/std": 0.40206894278526306,
+      "step": 2429
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1052.0,
+      "completions/max_terminated_length": 1052.0,
+      "completions/mean_length": 447.5625305175781,
+      "completions/mean_terminated_length": 447.5625305175781,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 2.5076089760123805,
+      "grad_norm": 0.7682133913040161,
+      "kl": 0.15185546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0143,
+      "num_tokens": 285694041.0,
+      "reward": 1.69910728931427,
+      "reward_std": 0.17765921354293823,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6991071701049805,
+      "rewards/curriculum_aware_reward_fn/std": 0.3511873185634613,
+      "step": 2430
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 730.0,
+      "completions/max_terminated_length": 730.0,
+      "completions/mean_length": 389.15179443359375,
+      "completions/mean_terminated_length": 389.15179443359375,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 2.5086407015733814,
+      "grad_norm": 0.8312105536460876,
+      "kl": 0.150390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0124,
+      "num_tokens": 285799996.0,
+      "reward": 1.6107144355773926,
+      "reward_std": 0.2674295902252197,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.610714316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.40260574221611023,
+      "step": 2431
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1022.0,
+      "completions/max_terminated_length": 1022.0,
+      "completions/mean_length": 460.5714416503906,
+      "completions/mean_terminated_length": 460.5714416503906,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "epoch": 2.509672427134382,
+      "grad_norm": 0.7323674559593201,
+      "kl": 0.161376953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0303,
+      "num_tokens": 285923855.0,
+      "reward": 1.4986608028411865,
+      "reward_std": 0.24598772823810577,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49866071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.4470856785774231,
+      "step": 2432
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1230.0,
+      "completions/max_terminated_length": 1230.0,
+      "completions/mean_length": 438.83929443359375,
+      "completions/mean_terminated_length": 438.83929443359375,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 2.510704152695383,
+      "grad_norm": 0.7977431416511536,
+      "kl": 0.148681640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0007,
+      "num_tokens": 286039414.0,
+      "reward": 1.3580358028411865,
+      "reward_std": 0.14513644576072693,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35803571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.40794771909713745,
+      "step": 2433
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1096.0,
+      "completions/max_terminated_length": 1096.0,
+      "completions/mean_length": 446.044677734375,
+      "completions/mean_terminated_length": 446.044677734375,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 2.511735878256384,
+      "grad_norm": 0.7356109619140625,
+      "kl": 0.137451171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0015,
+      "num_tokens": 286156312.0,
+      "reward": 1.5607144832611084,
+      "reward_std": 0.2145209014415741,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5696428418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.4123300611972809,
+      "step": 2434
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1300.0,
+      "completions/max_terminated_length": 1300.0,
+      "completions/mean_length": 472.4375305175781,
+      "completions/mean_terminated_length": 472.4375305175781,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 2.5127676038173847,
+      "grad_norm": 0.8715661764144897,
+      "kl": 0.1484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0146,
+      "num_tokens": 286283831.0,
+      "reward": 1.4593751430511475,
+      "reward_std": 0.21618635952472687,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4683035910129547,
+      "rewards/curriculum_aware_reward_fn/std": 0.36440718173980713,
+      "step": 2435
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1116.0,
+      "completions/max_terminated_length": 1116.0,
+      "completions/mean_length": 444.3750305175781,
+      "completions/mean_terminated_length": 444.3750305175781,
+      "completions/min_length": 143.0,
+      "completions/min_terminated_length": 143.0,
+      "epoch": 2.5137993293783856,
+      "grad_norm": 0.7075579166412354,
+      "kl": 0.14892578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0087,
+      "num_tokens": 286396454.0,
+      "reward": 1.4535716772079468,
+      "reward_std": 0.16791139543056488,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46250003576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.40711575746536255,
+      "step": 2436
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3690.0,
+      "completions/max_terminated_length": 3690.0,
+      "completions/mean_length": 547.419677734375,
+      "completions/mean_terminated_length": 547.419677734375,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 2.514831054939386,
+      "grad_norm": 0.8053109645843506,
+      "kl": 0.146240234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0122,
+      "num_tokens": 286530754.0,
+      "reward": 1.5142858028411865,
+      "reward_std": 0.3455066978931427,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.514285683631897,
+      "rewards/curriculum_aware_reward_fn/std": 0.4575900733470917,
+      "step": 2437
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1186.0,
+      "completions/max_terminated_length": 1186.0,
+      "completions/mean_length": 494.3482360839844,
+      "completions/mean_terminated_length": 494.3482360839844,
+      "completions/min_length": 153.0,
+      "completions/min_terminated_length": 153.0,
+      "epoch": 2.515862780500387,
+      "grad_norm": 0.6410834789276123,
+      "kl": 0.15087890625,
+      "learning_rate": 1e-06,
+      "loss": 0.008,
+      "num_tokens": 286652124.0,
+      "reward": 1.3754465579986572,
+      "reward_std": 0.16579985618591309,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3754464089870453,
+      "rewards/curriculum_aware_reward_fn/std": 0.4305347502231598,
+      "step": 2438
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 907.0,
+      "completions/max_terminated_length": 907.0,
+      "completions/mean_length": 452.7589416503906,
+      "completions/mean_terminated_length": 452.7589416503906,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "epoch": 2.5168945060613876,
+      "grad_norm": 0.6556716561317444,
+      "kl": 0.145263671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0196,
+      "num_tokens": 286772861.0,
+      "reward": 1.5330358743667603,
+      "reward_std": 0.14051130414009094,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5330356955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.43597015738487244,
+      "step": 2439
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1012.0,
+      "completions/max_terminated_length": 1012.0,
+      "completions/mean_length": 463.982177734375,
+      "completions/mean_terminated_length": 463.982177734375,
+      "completions/min_length": 120.0,
+      "completions/min_terminated_length": 120.0,
+      "epoch": 2.5179262316223885,
+      "grad_norm": 0.6537541747093201,
+      "kl": 0.1416015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0075,
+      "num_tokens": 286884243.0,
+      "reward": 1.6959822177886963,
+      "reward_std": 0.15117120742797852,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6959820985794067,
+      "rewards/curriculum_aware_reward_fn/std": 0.4043200612068176,
+      "step": 2440
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1107.0,
+      "completions/max_terminated_length": 1107.0,
+      "completions/mean_length": 482.7232360839844,
+      "completions/mean_terminated_length": 482.7232360839844,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 2.5189579571833893,
+      "grad_norm": 0.688506543636322,
+      "kl": 0.156494140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0186,
+      "num_tokens": 287000055.0,
+      "reward": 1.5665180683135986,
+      "reward_std": 0.17180171608924866,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5665178894996643,
+      "rewards/curriculum_aware_reward_fn/std": 0.36020442843437195,
+      "step": 2441
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1895.0,
+      "completions/max_terminated_length": 1895.0,
+      "completions/mean_length": 468.77679443359375,
+      "completions/mean_terminated_length": 468.77679443359375,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 2.5199896827443897,
+      "grad_norm": 0.8272270560264587,
+      "kl": 0.14013671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0126,
+      "num_tokens": 287124553.0,
+      "reward": 1.5071429014205933,
+      "reward_std": 0.18399137258529663,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5071428418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.4011165201663971,
+      "step": 2442
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1101.0,
+      "completions/max_terminated_length": 1101.0,
+      "completions/mean_length": 479.6785888671875,
+      "completions/mean_terminated_length": 479.6785888671875,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "epoch": 2.5210214083053906,
+      "grad_norm": 0.7374287843704224,
+      "kl": 0.159912109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0235,
+      "num_tokens": 287249040.0,
+      "reward": 1.4357144832611084,
+      "reward_std": 0.23039209842681885,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4357143044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.3941822648048401,
+      "step": 2443
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1153.0,
+      "completions/max_terminated_length": 1153.0,
+      "completions/mean_length": 440.1160888671875,
+      "completions/mean_terminated_length": 440.1160888671875,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "epoch": 2.5220531338663914,
+      "grad_norm": 0.833993136882782,
+      "kl": 0.154296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0338,
+      "num_tokens": 287356440.0,
+      "reward": 1.5147321224212646,
+      "reward_std": 0.230870321393013,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5147321224212646,
+      "rewards/curriculum_aware_reward_fn/std": 0.3973809778690338,
+      "step": 2444
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2434.0,
+      "completions/max_terminated_length": 2434.0,
+      "completions/mean_length": 415.95538330078125,
+      "completions/mean_terminated_length": 415.95538330078125,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 2.5230848594273922,
+      "grad_norm": 0.8010690808296204,
+      "kl": 0.16357421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0324,
+      "num_tokens": 287465558.0,
+      "reward": 1.6500000953674316,
+      "reward_std": 0.14846742153167725,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6500000357627869,
+      "rewards/curriculum_aware_reward_fn/std": 0.4071987271308899,
+      "step": 2445
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 834.0,
+      "completions/max_terminated_length": 834.0,
+      "completions/mean_length": 407.4910888671875,
+      "completions/mean_terminated_length": 407.4910888671875,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 2.524116584988393,
+      "grad_norm": 0.8469009399414062,
+      "kl": 0.158935546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0043,
+      "num_tokens": 287577354.0,
+      "reward": 1.6169644594192505,
+      "reward_std": 0.24044549465179443,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6169642806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.37425923347473145,
+      "step": 2446
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1300.0,
+      "completions/max_terminated_length": 1300.0,
+      "completions/mean_length": 460.5714416503906,
+      "completions/mean_terminated_length": 460.5714416503906,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 2.525148310549394,
+      "grad_norm": 0.8326296210289001,
+      "kl": 0.15771484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0037,
+      "num_tokens": 287696033.0,
+      "reward": 1.5566965341567993,
+      "reward_std": 0.2956428527832031,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5566964149475098,
+      "rewards/curriculum_aware_reward_fn/std": 0.44364863634109497,
+      "step": 2447
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 869.0,
+      "completions/max_terminated_length": 869.0,
+      "completions/mean_length": 452.2500305175781,
+      "completions/mean_terminated_length": 452.2500305175781,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 2.5261800361103948,
+      "grad_norm": 0.8186579942703247,
+      "kl": 0.154052734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0246,
+      "num_tokens": 287826422.0,
+      "reward": 1.4031251668930054,
+      "reward_std": 0.2201891839504242,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40312501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.39775729179382324,
+      "step": 2448
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 674.0,
+      "completions/max_terminated_length": 674.0,
+      "completions/mean_length": 439.58038330078125,
+      "completions/mean_terminated_length": 439.58038330078125,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 2.5272117616713956,
+      "grad_norm": 0.8152475357055664,
+      "kl": 0.16064453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0321,
+      "num_tokens": 287944185.0,
+      "reward": 1.3558037281036377,
+      "reward_std": 0.22809594869613647,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35580354928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.35966357588768005,
+      "step": 2449
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1347.0,
+      "completions/max_terminated_length": 1347.0,
+      "completions/mean_length": 466.9464416503906,
+      "completions/mean_terminated_length": 466.9464416503906,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 2.5282434872323964,
+      "grad_norm": 0.8093869090080261,
+      "kl": 0.15185546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0066,
+      "num_tokens": 288064782.0,
+      "reward": 1.4241071939468384,
+      "reward_std": 0.21188300848007202,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4241071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.4358372986316681,
+      "step": 2450
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1476.0,
+      "completions/max_terminated_length": 1476.0,
+      "completions/mean_length": 466.7500305175781,
+      "completions/mean_terminated_length": 466.7500305175781,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 2.529275212793397,
+      "grad_norm": 0.6856550574302673,
+      "kl": 0.16357421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0227,
+      "num_tokens": 288187863.0,
+      "reward": 1.5281250476837158,
+      "reward_std": 0.2091558426618576,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5370535850524902,
+      "rewards/curriculum_aware_reward_fn/std": 0.4404515326023102,
+      "step": 2451
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1072.0,
+      "completions/max_terminated_length": 1072.0,
+      "completions/mean_length": 426.2232360839844,
+      "completions/mean_terminated_length": 426.2232360839844,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 2.5303069383543977,
+      "grad_norm": 0.8464824557304382,
+      "kl": 0.15869140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0258,
+      "num_tokens": 288296920.0,
+      "reward": 1.6477679014205933,
+      "reward_std": 0.1869218498468399,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6477679014205933,
+      "rewards/curriculum_aware_reward_fn/std": 0.39257684350013733,
+      "step": 2452
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1098.0,
+      "completions/max_terminated_length": 1098.0,
+      "completions/mean_length": 461.7500305175781,
+      "completions/mean_terminated_length": 461.7500305175781,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 2.5313386639153985,
+      "grad_norm": 0.6828495860099792,
+      "kl": 0.148681640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0081,
+      "num_tokens": 288412826.0,
+      "reward": 1.6468751430511475,
+      "reward_std": 0.12923133373260498,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6468750238418579,
+      "rewards/curriculum_aware_reward_fn/std": 0.40265342593193054,
+      "step": 2453
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1999.0,
+      "completions/max_terminated_length": 1999.0,
+      "completions/mean_length": 492.77679443359375,
+      "completions/mean_terminated_length": 492.77679443359375,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 2.5323703894763994,
+      "grad_norm": 0.7749906182289124,
+      "kl": 0.17529296875,
+      "learning_rate": 1e-06,
+      "loss": 0.011,
+      "num_tokens": 288540418.0,
+      "reward": 1.4218751192092896,
+      "reward_std": 0.22093766927719116,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4308035671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.40354543924331665,
+      "step": 2454
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 995.0,
+      "completions/max_terminated_length": 995.0,
+      "completions/mean_length": 439.0089416503906,
+      "completions/mean_terminated_length": 439.0089416503906,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 2.5334021150374,
+      "grad_norm": 0.6202346682548523,
+      "kl": 0.15771484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0087,
+      "num_tokens": 288646217.0,
+      "reward": 1.4843751192092896,
+      "reward_std": 0.17100587487220764,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.484375,
+      "rewards/curriculum_aware_reward_fn/std": 0.445600688457489,
+      "step": 2455
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1362.0,
+      "completions/max_terminated_length": 1362.0,
+      "completions/mean_length": 475.4732360839844,
+      "completions/mean_terminated_length": 475.4732360839844,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 2.5344338405984006,
+      "grad_norm": 0.7288236618041992,
+      "kl": 0.156982421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0105,
+      "num_tokens": 288773341.0,
+      "reward": 1.487946629524231,
+      "reward_std": 0.21048535406589508,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48794645071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.4141770005226135,
+      "step": 2456
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1653.0,
+      "completions/max_terminated_length": 1653.0,
+      "completions/mean_length": 464.51788330078125,
+      "completions/mean_terminated_length": 464.51788330078125,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 2.5354655661594014,
+      "grad_norm": 0.8047718405723572,
+      "kl": 0.157470703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0037,
+      "num_tokens": 288888570.0,
+      "reward": 1.5767858028411865,
+      "reward_std": 0.2637113332748413,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5857143402099609,
+      "rewards/curriculum_aware_reward_fn/std": 0.45798367261886597,
+      "step": 2457
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2133.0,
+      "completions/max_terminated_length": 2133.0,
+      "completions/mean_length": 588.8125,
+      "completions/mean_terminated_length": 588.8125,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "epoch": 2.5364972917204023,
+      "grad_norm": 0.7194903492927551,
+      "kl": 0.14111328125,
+      "learning_rate": 1e-06,
+      "loss": 0.009,
+      "num_tokens": 289035444.0,
+      "reward": 1.3660714626312256,
+      "reward_std": 0.18908150494098663,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3660714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.34296101331710815,
+      "step": 2458
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1347.0,
+      "completions/max_terminated_length": 1347.0,
+      "completions/mean_length": 534.1964721679688,
+      "completions/mean_terminated_length": 534.1964721679688,
+      "completions/min_length": 243.0,
+      "completions/min_terminated_length": 243.0,
+      "epoch": 2.537529017281403,
+      "grad_norm": 0.7994199991226196,
+      "kl": 0.1484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0247,
+      "num_tokens": 289162278.0,
+      "reward": 1.3794645071029663,
+      "reward_std": 0.20744940638542175,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3883928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3257688283920288,
+      "step": 2459
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2228.0,
+      "completions/max_terminated_length": 2228.0,
+      "completions/mean_length": 562.669677734375,
+      "completions/mean_terminated_length": 562.669677734375,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 2.538560742842404,
+      "grad_norm": 0.7126930356025696,
+      "kl": 0.1357421875,
+      "learning_rate": 1e-06,
+      "loss": 0.054,
+      "num_tokens": 289293904.0,
+      "reward": 1.5178571939468384,
+      "reward_std": 0.2616358697414398,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5267857313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.42170441150665283,
+      "step": 2460
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2335.0,
+      "completions/max_terminated_length": 2335.0,
+      "completions/mean_length": 581.3482666015625,
+      "completions/mean_terminated_length": 581.3482666015625,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 2.539592468403405,
+      "grad_norm": 0.709905207157135,
+      "kl": 0.136962890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0371,
+      "num_tokens": 289430240.0,
+      "reward": 1.3718750476837158,
+      "reward_std": 0.21493136882781982,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37187501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.4054127335548401,
+      "step": 2461
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2754.0,
+      "completions/max_terminated_length": 2754.0,
+      "completions/mean_length": 504.64288330078125,
+      "completions/mean_terminated_length": 504.64288330078125,
+      "completions/min_length": 153.0,
+      "completions/min_terminated_length": 153.0,
+      "epoch": 2.5406241939644056,
+      "grad_norm": 0.7318965792655945,
+      "kl": 0.162109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0258,
+      "num_tokens": 289556102.0,
+      "reward": 1.6785714626312256,
+      "reward_std": 0.1515929102897644,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6785714030265808,
+      "rewards/curriculum_aware_reward_fn/std": 0.5129280686378479,
+      "step": 2462
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1980.0,
+      "completions/max_terminated_length": 1980.0,
+      "completions/mean_length": 463.5535888671875,
+      "completions/mean_terminated_length": 463.5535888671875,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 2.5416559195254065,
+      "grad_norm": 0.7726377248764038,
+      "kl": 0.161376953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0123,
+      "num_tokens": 289671160.0,
+      "reward": 1.6642857789993286,
+      "reward_std": 0.2374168336391449,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6642857193946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.3770695626735687,
+      "step": 2463
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1769.0,
+      "completions/mean_length": 530.0803833007812,
+      "completions/mean_terminated_length": 497.9549560546875,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 2.542687645086407,
+      "grad_norm": 0.7113441228866577,
+      "kl": 0.16357421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0104,
+      "num_tokens": 289794257.0,
+      "reward": 1.593750238418579,
+      "reward_std": 0.1650165319442749,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.59375,
+      "rewards/curriculum_aware_reward_fn/std": 0.430698424577713,
+      "step": 2464
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 984.0,
+      "completions/max_terminated_length": 984.0,
+      "completions/mean_length": 452.4732360839844,
+      "completions/mean_terminated_length": 452.4732360839844,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 2.5437193706474077,
+      "grad_norm": 0.6848057508468628,
+      "kl": 0.156494140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0007,
+      "num_tokens": 289916425.0,
+      "reward": 1.5254465341567993,
+      "reward_std": 0.1454419493675232,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.534375011920929,
+      "rewards/curriculum_aware_reward_fn/std": 0.43521925806999207,
+      "step": 2465
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1088.0,
+      "completions/max_terminated_length": 1088.0,
+      "completions/mean_length": 517.5267944335938,
+      "completions/mean_terminated_length": 517.5267944335938,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 2.5447510962084086,
+      "grad_norm": 0.7695538401603699,
+      "kl": 0.15869140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0267,
+      "num_tokens": 290044710.0,
+      "reward": 1.4553571939468384,
+      "reward_std": 0.1838531196117401,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4642857015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.40981364250183105,
+      "step": 2466
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2111.0,
+      "completions/max_terminated_length": 2111.0,
+      "completions/mean_length": 508.46429443359375,
+      "completions/mean_terminated_length": 508.46429443359375,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 2.5457828217694094,
+      "grad_norm": 0.7199175357818604,
+      "kl": 0.166015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0456,
+      "num_tokens": 290165421.0,
+      "reward": 1.6058037281036377,
+      "reward_std": 0.17861010134220123,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6147321462631226,
+      "rewards/curriculum_aware_reward_fn/std": 0.42673271894454956,
+      "step": 2467
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1183.0,
+      "completions/max_terminated_length": 1183.0,
+      "completions/mean_length": 485.08929443359375,
+      "completions/mean_terminated_length": 485.08929443359375,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 2.5468145473304102,
+      "grad_norm": 0.7515586614608765,
+      "kl": 0.159423828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0235,
+      "num_tokens": 290293594.0,
+      "reward": 1.5303571224212646,
+      "reward_std": 0.21597933769226074,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5303571820259094,
+      "rewards/curriculum_aware_reward_fn/std": 0.41440051794052124,
+      "step": 2468
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2193.0,
+      "completions/max_terminated_length": 2193.0,
+      "completions/mean_length": 581.732177734375,
+      "completions/mean_terminated_length": 581.732177734375,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 2.5478462728914106,
+      "grad_norm": 0.7689985632896423,
+      "kl": 0.166015625,
+      "learning_rate": 1e-06,
+      "loss": 0.011,
+      "num_tokens": 290426466.0,
+      "reward": 1.4830358028411865,
+      "reward_std": 0.2173272669315338,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48303574323654175,
+      "rewards/curriculum_aware_reward_fn/std": 0.3862241208553314,
+      "step": 2469
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2914.0,
+      "completions/max_terminated_length": 2914.0,
+      "completions/mean_length": 592.9107666015625,
+      "completions/mean_terminated_length": 592.9107666015625,
+      "completions/min_length": 120.0,
+      "completions/min_terminated_length": 120.0,
+      "epoch": 2.5488779984524115,
+      "grad_norm": 0.5767408013343811,
+      "kl": 0.160888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0269,
+      "num_tokens": 290551273.0,
+      "reward": 1.3696428537368774,
+      "reward_std": 0.172710120677948,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36964288353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.4238581657409668,
+      "step": 2470
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1677.0,
+      "completions/mean_length": 651.8482666015625,
+      "completions/mean_terminated_length": 589.2272338867188,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 2.5499097240134123,
+      "grad_norm": 0.6692609190940857,
+      "kl": 0.16162109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0472,
+      "num_tokens": 290694525.0,
+      "reward": 1.5656250715255737,
+      "reward_std": 0.19344080984592438,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5745536088943481,
+      "rewards/curriculum_aware_reward_fn/std": 0.41611650586128235,
+      "step": 2471
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3462.0,
+      "completions/mean_length": 654.8392944335938,
+      "completions/mean_terminated_length": 623.8378295898438,
+      "completions/min_length": 263.0,
+      "completions/min_terminated_length": 263.0,
+      "epoch": 2.550941449574413,
+      "grad_norm": 0.6685264706611633,
+      "kl": 0.1474609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0319,
+      "num_tokens": 290833800.0,
+      "reward": 1.5674108266830444,
+      "reward_std": 0.24880172312259674,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5763393044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.3816724717617035,
+      "step": 2472
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0357142857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3203.0,
+      "completions/mean_length": 808.3660888671875,
+      "completions/mean_terminated_length": 686.6018676757812,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 2.551973175135414,
+      "grad_norm": 0.5768068432807922,
+      "kl": 0.14697265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0611,
+      "num_tokens": 290988506.0,
+      "reward": 1.541517972946167,
+      "reward_std": 0.15940067172050476,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5415178537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.40374869108200073,
+      "step": 2473
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3218.0,
+      "completions/mean_length": 705.7767944335938,
+      "completions/mean_terminated_length": 675.2342529296875,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 2.553004900696415,
+      "grad_norm": 0.6619222164154053,
+      "kl": 0.154052734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0367,
+      "num_tokens": 291135173.0,
+      "reward": 1.5513393878936768,
+      "reward_std": 0.21687711775302887,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5513392686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.42178618907928467,
+      "step": 2474
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3998.0,
+      "completions/mean_length": 719.6875610351562,
+      "completions/mean_terminated_length": 689.270263671875,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "epoch": 2.5540366262574157,
+      "grad_norm": 0.6049193739891052,
+      "kl": 0.142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0594,
+      "num_tokens": 291278225.0,
+      "reward": 1.6330357789993286,
+      "reward_std": 0.23981758952140808,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6330356597900391,
+      "rewards/curriculum_aware_reward_fn/std": 0.4030599892139435,
+      "step": 2475
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3030.0,
+      "completions/mean_length": 745.1160888671875,
+      "completions/mean_terminated_length": 714.9279174804688,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 2.5550683518184165,
+      "grad_norm": 0.5797421336174011,
+      "kl": 0.145263671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0098,
+      "num_tokens": 291426267.0,
+      "reward": 1.4638394117355347,
+      "reward_std": 0.2388923615217209,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4727678894996643,
+      "rewards/curriculum_aware_reward_fn/std": 0.4605562686920166,
+      "step": 2476
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3957.0,
+      "completions/max_terminated_length": 3957.0,
+      "completions/mean_length": 685.1517944335938,
+      "completions/mean_terminated_length": 685.1517944335938,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 2.556100077379417,
+      "grad_norm": 0.664794921875,
+      "kl": 0.151611328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0832,
+      "num_tokens": 291573347.0,
+      "reward": 1.4888393878936768,
+      "reward_std": 0.2121625393629074,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4888392984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.34755972027778625,
+      "step": 2477
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3135.0,
+      "completions/mean_length": 831.982177734375,
+      "completions/mean_terminated_length": 802.5765991210938,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 2.5571318029404178,
+      "grad_norm": 0.530683696269989,
+      "kl": 0.136962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0293,
+      "num_tokens": 291733806.0,
+      "reward": 1.433035969734192,
+      "reward_std": 0.2383945882320404,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4419642984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.43233877420425415,
+      "step": 2478
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3562.0,
+      "completions/max_terminated_length": 3562.0,
+      "completions/mean_length": 703.5267944335938,
+      "completions/mean_terminated_length": 703.5267944335938,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 2.5581635285014186,
+      "grad_norm": 0.5770664215087891,
+      "kl": 0.153564453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0261,
+      "num_tokens": 291882573.0,
+      "reward": 1.5433037281036377,
+      "reward_std": 0.21655043959617615,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5433036088943481,
+      "rewards/curriculum_aware_reward_fn/std": 0.4411030113697052,
+      "step": 2479
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3648.0,
+      "completions/mean_length": 762.1607666015625,
+      "completions/mean_terminated_length": 732.1261596679688,
+      "completions/min_length": 257.0,
+      "completions/min_terminated_length": 257.0,
+      "epoch": 2.5591952540624194,
+      "grad_norm": 0.6695989370346069,
+      "kl": 0.157470703125,
+      "learning_rate": 1e-06,
+      "loss": -0.073,
+      "num_tokens": 292040721.0,
+      "reward": 1.4433035850524902,
+      "reward_std": 0.288400262594223,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47008928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.39154690504074097,
+      "step": 2480
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3680.0,
+      "completions/max_terminated_length": 3680.0,
+      "completions/mean_length": 700.5089721679688,
+      "completions/mean_terminated_length": 700.5089721679688,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 2.5602269796234203,
+      "grad_norm": 0.5907813906669617,
+      "kl": 0.138671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0907,
+      "num_tokens": 292186910.0,
+      "reward": 1.35535728931427,
+      "reward_std": 0.2883957028388977,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3732143044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.40201789140701294,
+      "step": 2481
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3300.0,
+      "completions/max_terminated_length": 3300.0,
+      "completions/mean_length": 612.6875,
+      "completions/mean_terminated_length": 612.6875,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 2.5612587051844207,
+      "grad_norm": 0.6172129511833191,
+      "kl": 0.143310546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0363,
+      "num_tokens": 292317453.0,
+      "reward": 1.38660728931427,
+      "reward_std": 0.11005396395921707,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3866071403026581,
+      "rewards/curriculum_aware_reward_fn/std": 0.3735019564628601,
+      "step": 2482
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3715.0,
+      "completions/mean_length": 775.5982666015625,
+      "completions/mean_terminated_length": 745.6846923828125,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "epoch": 2.5622904307454215,
+      "grad_norm": 0.5745029449462891,
+      "kl": 0.141357421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0298,
+      "num_tokens": 292478247.0,
+      "reward": 1.450446605682373,
+      "reward_std": 0.24547609686851501,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45044639706611633,
+      "rewards/curriculum_aware_reward_fn/std": 0.35320258140563965,
+      "step": 2483
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3417.0,
+      "completions/max_terminated_length": 3417.0,
+      "completions/mean_length": 662.4642944335938,
+      "completions/mean_terminated_length": 662.4642944335938,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "epoch": 2.5633221563064224,
+      "grad_norm": 0.6197950839996338,
+      "kl": 0.14501953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0226,
+      "num_tokens": 292615800.0,
+      "reward": 1.4772323369979858,
+      "reward_std": 0.2510889768600464,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4950892925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.4198862612247467,
+      "step": 2484
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2696.0,
+      "completions/mean_length": 764.8035888671875,
+      "completions/mean_terminated_length": 704.236328125,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 2.564353881867423,
+      "grad_norm": 0.6195743083953857,
+      "kl": 0.138916015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0017,
+      "num_tokens": 292764185.0,
+      "reward": 1.4839287996292114,
+      "reward_std": 0.17358790338039398,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48392853140830994,
+      "rewards/curriculum_aware_reward_fn/std": 0.4302845299243927,
+      "step": 2485
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3639.0,
+      "completions/mean_length": 695.0892944335938,
+      "completions/mean_terminated_length": 664.450439453125,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 2.565385607428424,
+      "grad_norm": 0.6343938112258911,
+      "kl": 0.1455078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0215,
+      "num_tokens": 292909837.0,
+      "reward": 1.516517996788025,
+      "reward_std": 0.2180844098329544,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5254464149475098,
+      "rewards/curriculum_aware_reward_fn/std": 0.38554954528808594,
+      "step": 2486
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1761.0,
+      "completions/mean_length": 628.9107666015625,
+      "completions/mean_terminated_length": 597.6757202148438,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 2.566417332989425,
+      "grad_norm": 0.6394612193107605,
+      "kl": 0.146484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0043,
+      "num_tokens": 293047174.0,
+      "reward": 1.5357145071029663,
+      "reward_std": 0.3192685842514038,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5357142686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.4290902018547058,
+      "step": 2487
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2599.0,
+      "completions/mean_length": 652.607177734375,
+      "completions/mean_terminated_length": 621.5855712890625,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 2.5674490585504257,
+      "grad_norm": 0.7118006348609924,
+      "kl": 0.155029296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0636,
+      "num_tokens": 293186388.0,
+      "reward": 1.5379464626312256,
+      "reward_std": 0.2745712697505951,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5379464030265808,
+      "rewards/curriculum_aware_reward_fn/std": 0.4722118675708771,
+      "step": 2488
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1626.0,
+      "completions/max_terminated_length": 1626.0,
+      "completions/mean_length": 582.7678833007812,
+      "completions/mean_terminated_length": 582.7678833007812,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 2.5684807841114266,
+      "grad_norm": 0.7305005192756653,
+      "kl": 0.150390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0279,
+      "num_tokens": 293318242.0,
+      "reward": 1.3830358982086182,
+      "reward_std": 0.20811490714550018,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39196428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.4081133008003235,
+      "step": 2489
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1658.0,
+      "completions/max_terminated_length": 1658.0,
+      "completions/mean_length": 516.3928833007812,
+      "completions/mean_terminated_length": 516.3928833007812,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 2.5695125096724274,
+      "grad_norm": 0.6877014636993408,
+      "kl": 0.151123046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0043,
+      "num_tokens": 293442949.0,
+      "reward": 1.517857313156128,
+      "reward_std": 0.17956021428108215,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5267857313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.4478139281272888,
+      "step": 2490
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2395.0,
+      "completions/max_terminated_length": 2395.0,
+      "completions/mean_length": 593.1785888671875,
+      "completions/mean_terminated_length": 593.1785888671875,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 2.570544235233428,
+      "grad_norm": 0.7326036691665649,
+      "kl": 0.140869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0016,
+      "num_tokens": 293576907.0,
+      "reward": 1.5151785612106323,
+      "reward_std": 0.20150600373744965,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5151785612106323,
+      "rewards/curriculum_aware_reward_fn/std": 0.44112056493759155,
+      "step": 2491
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2668.0,
+      "completions/max_terminated_length": 2668.0,
+      "completions/mean_length": 655.4642944335938,
+      "completions/mean_terminated_length": 655.4642944335938,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 2.5715759607944286,
+      "grad_norm": 0.6929550766944885,
+      "kl": 0.137451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0143,
+      "num_tokens": 293719731.0,
+      "reward": 1.5000001192092896,
+      "reward_std": 0.22161515057086945,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5,
+      "rewards/curriculum_aware_reward_fn/std": 0.42511260509490967,
+      "step": 2492
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3589.0,
+      "completions/mean_length": 609.0892944335938,
+      "completions/mean_terminated_length": 577.6756591796875,
+      "completions/min_length": 132.0,
+      "completions/min_terminated_length": 132.0,
+      "epoch": 2.5726076863554295,
+      "grad_norm": 0.5321632623672485,
+      "kl": 0.1278076171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0153,
+      "num_tokens": 293849129.0,
+      "reward": 1.5357143878936768,
+      "reward_std": 0.14974889159202576,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5357142686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.428302139043808,
+      "step": 2493
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1772.0,
+      "completions/max_terminated_length": 1772.0,
+      "completions/mean_length": 537.8482666015625,
+      "completions/mean_terminated_length": 537.8482666015625,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 2.5736394119164303,
+      "grad_norm": 0.7619335651397705,
+      "kl": 0.14697265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0658,
+      "num_tokens": 293969281.0,
+      "reward": 1.4763394594192505,
+      "reward_std": 0.21331331133842468,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47633928060531616,
+      "rewards/curriculum_aware_reward_fn/std": 0.430794358253479,
+      "step": 2494
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2394.0,
+      "completions/max_terminated_length": 2394.0,
+      "completions/mean_length": 540.107177734375,
+      "completions/mean_terminated_length": 540.107177734375,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 2.574671137477431,
+      "grad_norm": 0.548896074295044,
+      "kl": 0.14501953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0275,
+      "num_tokens": 294092948.0,
+      "reward": 1.4799107313156128,
+      "reward_std": 0.14194467663764954,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4799107015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.4888605773448944,
+      "step": 2495
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1709.0,
+      "completions/mean_length": 569.7232666015625,
+      "completions/mean_terminated_length": 537.9549560546875,
+      "completions/min_length": 302.0,
+      "completions/min_terminated_length": 302.0,
+      "epoch": 2.5757028630384315,
+      "grad_norm": 0.6730985641479492,
+      "kl": 0.1494140625,
+      "learning_rate": 1e-06,
+      "loss": -0.042,
+      "num_tokens": 294223376.0,
+      "reward": 1.5250000953674316,
+      "reward_std": 0.1675071120262146,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5250000357627869,
+      "rewards/curriculum_aware_reward_fn/std": 0.43933814764022827,
+      "step": 2496
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2365.0,
+      "completions/mean_length": 538.5089721679688,
+      "completions/mean_terminated_length": 506.45947265625,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 2.5767345885994324,
+      "grad_norm": 0.7262006402015686,
+      "kl": 0.150390625,
+      "learning_rate": 1e-06,
+      "loss": 0.002,
+      "num_tokens": 294352324.0,
+      "reward": 1.4763394594192505,
+      "reward_std": 0.15129299461841583,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47633931040763855,
+      "rewards/curriculum_aware_reward_fn/std": 0.41589802503585815,
+      "step": 2497
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2190.0,
+      "completions/max_terminated_length": 2190.0,
+      "completions/mean_length": 520.3660888671875,
+      "completions/mean_terminated_length": 520.3660888671875,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 2.5777663141604332,
+      "grad_norm": 0.6421841382980347,
+      "kl": 0.1376953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0045,
+      "num_tokens": 294478512.0,
+      "reward": 1.708035945892334,
+      "reward_std": 0.20250266790390015,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7080357670783997,
+      "rewards/curriculum_aware_reward_fn/std": 0.5012845396995544,
+      "step": 2498
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3100.0,
+      "completions/max_terminated_length": 3100.0,
+      "completions/mean_length": 635.857177734375,
+      "completions/mean_terminated_length": 635.857177734375,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 2.578798039721434,
+      "grad_norm": 0.6643877625465393,
+      "kl": 0.142333984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0229,
+      "num_tokens": 294619238.0,
+      "reward": 1.4223215579986572,
+      "reward_std": 0.20365062355995178,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4401785731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.4161621928215027,
+      "step": 2499
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3352.0,
+      "completions/max_terminated_length": 3352.0,
+      "completions/mean_length": 602.0178833007812,
+      "completions/mean_terminated_length": 602.0178833007812,
+      "completions/min_length": 231.0,
+      "completions/min_terminated_length": 231.0,
+      "epoch": 2.579829765282435,
+      "grad_norm": 0.5811689496040344,
+      "kl": 0.143310546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0006,
+      "num_tokens": 294755719.0,
+      "reward": 1.3437501192092896,
+      "reward_std": 0.15117493271827698,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34375,
+      "rewards/curriculum_aware_reward_fn/std": 0.41357186436653137,
+      "step": 2500
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2637.0,
+      "completions/max_terminated_length": 2637.0,
+      "completions/mean_length": 622.9910888671875,
+      "completions/mean_terminated_length": 622.9910888671875,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 2.5808614908434357,
+      "grad_norm": 0.7609834671020508,
+      "kl": 0.146484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0487,
+      "num_tokens": 294884238.0,
+      "reward": 1.6107144355773926,
+      "reward_std": 0.2443053424358368,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.610714316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.41678890585899353,
+      "step": 2501
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2255.0,
+      "completions/max_terminated_length": 2255.0,
+      "completions/mean_length": 649.2678833007812,
+      "completions/mean_terminated_length": 649.2678833007812,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 2.5818932164044366,
+      "grad_norm": 0.5464531183242798,
+      "kl": 0.144287109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0013,
+      "num_tokens": 295032479.0,
+      "reward": 1.6593750715255737,
+      "reward_std": 0.20846553146839142,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.659375011920929,
+      "rewards/curriculum_aware_reward_fn/std": 0.48923230171203613,
+      "step": 2502
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1999.0,
+      "completions/max_terminated_length": 1999.0,
+      "completions/mean_length": 578.4732666015625,
+      "completions/mean_terminated_length": 578.4732666015625,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 2.5829249419654374,
+      "grad_norm": 0.7156299948692322,
+      "kl": 0.1494140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0057,
+      "num_tokens": 295170887.0,
+      "reward": 1.5062501430511475,
+      "reward_std": 0.22756358981132507,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5062500238418579,
+      "rewards/curriculum_aware_reward_fn/std": 0.42288821935653687,
+      "step": 2503
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3690.0,
+      "completions/max_terminated_length": 3690.0,
+      "completions/mean_length": 775.9910888671875,
+      "completions/mean_terminated_length": 775.9910888671875,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 2.583956667526438,
+      "grad_norm": 0.5931575298309326,
+      "kl": 0.1270751953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0924,
+      "num_tokens": 295333818.0,
+      "reward": 1.3321430683135986,
+      "reward_std": 0.22101646661758423,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3410714268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.35004594922065735,
+      "step": 2504
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2471.0,
+      "completions/mean_length": 748.794677734375,
+      "completions/mean_terminated_length": 687.9363403320312,
+      "completions/min_length": 311.0,
+      "completions/min_terminated_length": 311.0,
+      "epoch": 2.5849883930874387,
+      "grad_norm": 0.6465186476707458,
+      "kl": 0.1312255859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0172,
+      "num_tokens": 295493220.0,
+      "reward": 1.3633930683135986,
+      "reward_std": 0.2086215764284134,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36339282989501953,
+      "rewards/curriculum_aware_reward_fn/std": 0.468992680311203,
+      "step": 2505
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1713.0,
+      "completions/max_terminated_length": 1713.0,
+      "completions/mean_length": 576.919677734375,
+      "completions/mean_terminated_length": 576.919677734375,
+      "completions/min_length": 267.0,
+      "completions/min_terminated_length": 267.0,
+      "epoch": 2.5860201186484395,
+      "grad_norm": 0.6617150902748108,
+      "kl": 0.138916015625,
+      "learning_rate": 1e-06,
+      "loss": 0.002,
+      "num_tokens": 295626477.0,
+      "reward": 1.6214287281036377,
+      "reward_std": 0.24607469141483307,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6214286088943481,
+      "rewards/curriculum_aware_reward_fn/std": 0.5417153835296631,
+      "step": 2506
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2568.0,
+      "completions/max_terminated_length": 2568.0,
+      "completions/mean_length": 541.5267944335938,
+      "completions/mean_terminated_length": 541.5267944335938,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 2.5870518442094403,
+      "grad_norm": 0.6159235239028931,
+      "kl": 0.141357421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0058,
+      "num_tokens": 295751087.0,
+      "reward": 1.4861608743667603,
+      "reward_std": 0.16843244433403015,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49508926272392273,
+      "rewards/curriculum_aware_reward_fn/std": 0.4519207775592804,
+      "step": 2507
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3286.0,
+      "completions/max_terminated_length": 3286.0,
+      "completions/mean_length": 628.2589721679688,
+      "completions/mean_terminated_length": 628.2589721679688,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 2.588083569770441,
+      "grad_norm": 0.6114004254341125,
+      "kl": 0.144775390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0077,
+      "num_tokens": 295886202.0,
+      "reward": 1.3754465579986572,
+      "reward_std": 0.17992451786994934,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3933035731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.39045456051826477,
+      "step": 2508
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2216.0,
+      "completions/max_terminated_length": 2216.0,
+      "completions/mean_length": 498.1875305175781,
+      "completions/mean_terminated_length": 498.1875305175781,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 2.5891152953314416,
+      "grad_norm": 0.7495975494384766,
+      "kl": 0.16357421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0012,
+      "num_tokens": 296007624.0,
+      "reward": 1.6892858743667603,
+      "reward_std": 0.1631748378276825,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6892856955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.5288279056549072,
+      "step": 2509
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2493.0,
+      "completions/max_terminated_length": 2493.0,
+      "completions/mean_length": 536.2232666015625,
+      "completions/mean_terminated_length": 536.2232666015625,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 2.5901470208924424,
+      "grad_norm": 0.8427813649177551,
+      "kl": 0.13330078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0517,
+      "num_tokens": 296133411.0,
+      "reward": 1.5415178537368774,
+      "reward_std": 0.25848016142845154,
+      "rewards/code_format_reward/mean": 0.9553571343421936,
+      "rewards/code_format_reward/std": 0.2074466347694397,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5861607193946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.45590272545814514,
+      "step": 2510
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1591.0,
+      "completions/mean_length": 683.0803833007812,
+      "completions/mean_terminated_length": 621.0272827148438,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "epoch": 2.5911787464534433,
+      "grad_norm": 0.6702219843864441,
+      "kl": 0.12060546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0029,
+      "num_tokens": 296274449.0,
+      "reward": 1.5794644355773926,
+      "reward_std": 0.25675874948501587,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.579464316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.41868269443511963,
+      "step": 2511
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3516.0,
+      "completions/max_terminated_length": 3516.0,
+      "completions/mean_length": 546.6160888671875,
+      "completions/mean_terminated_length": 546.6160888671875,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 2.592210472014444,
+      "grad_norm": 0.690642237663269,
+      "kl": 0.136474609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0006,
+      "num_tokens": 296393994.0,
+      "reward": 1.532142996788025,
+      "reward_std": 0.1899871975183487,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.550000011920929,
+      "rewards/curriculum_aware_reward_fn/std": 0.4154125154018402,
+      "step": 2512
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2391.0,
+      "completions/max_terminated_length": 2391.0,
+      "completions/mean_length": 629.4732666015625,
+      "completions/mean_terminated_length": 629.4732666015625,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "epoch": 2.593242197575445,
+      "grad_norm": 0.7589452266693115,
+      "kl": 0.14111328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0108,
+      "num_tokens": 296531667.0,
+      "reward": 1.415178656578064,
+      "reward_std": 0.23947352170944214,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4151785671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.4060266613960266,
+      "step": 2513
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1762.0,
+      "completions/max_terminated_length": 1762.0,
+      "completions/mean_length": 581.8482666015625,
+      "completions/mean_terminated_length": 581.8482666015625,
+      "completions/min_length": 274.0,
+      "completions/min_terminated_length": 274.0,
+      "epoch": 2.594273923136446,
+      "grad_norm": 0.6457692384719849,
+      "kl": 0.137451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0044,
+      "num_tokens": 296664573.0,
+      "reward": 1.4245537519454956,
+      "reward_std": 0.15253514051437378,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42455360293388367,
+      "rewards/curriculum_aware_reward_fn/std": 0.4164952039718628,
+      "step": 2514
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3557.0,
+      "completions/max_terminated_length": 3557.0,
+      "completions/mean_length": 605.0535888671875,
+      "completions/mean_terminated_length": 605.0535888671875,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 2.5953056486974466,
+      "grad_norm": 0.7411686778068542,
+      "kl": 0.1356201171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0455,
+      "num_tokens": 296801758.0,
+      "reward": 1.4861607551574707,
+      "reward_std": 0.18960067629814148,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4861607253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.35133013129234314,
+      "step": 2515
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2271.0,
+      "completions/max_terminated_length": 2271.0,
+      "completions/mean_length": 551.9285888671875,
+      "completions/mean_terminated_length": 551.9285888671875,
+      "completions/min_length": 280.0,
+      "completions/min_terminated_length": 280.0,
+      "epoch": 2.5963373742584475,
+      "grad_norm": 0.6617128849029541,
+      "kl": 0.13623046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0369,
+      "num_tokens": 296921534.0,
+      "reward": 1.532142996788025,
+      "reward_std": 0.16609406471252441,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5321428179740906,
+      "rewards/curriculum_aware_reward_fn/std": 0.43974077701568604,
+      "step": 2516
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1540.0,
+      "completions/max_terminated_length": 1540.0,
+      "completions/mean_length": 578.3660888671875,
+      "completions/mean_terminated_length": 578.3660888671875,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "epoch": 2.5973690998194483,
+      "grad_norm": 0.6656718850135803,
+      "kl": 0.1278076171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0153,
+      "num_tokens": 297058908.0,
+      "reward": 1.3535715341567993,
+      "reward_std": 0.27912285923957825,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35357141494750977,
+      "rewards/curriculum_aware_reward_fn/std": 0.4194437861442566,
+      "step": 2517
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2031.0,
+      "completions/max_terminated_length": 2031.0,
+      "completions/mean_length": 581.7589721679688,
+      "completions/mean_terminated_length": 581.7589721679688,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 2.5984008253804487,
+      "grad_norm": 0.6276634931564331,
+      "kl": 0.12451171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0349,
+      "num_tokens": 297194562.0,
+      "reward": 1.614732265472412,
+      "reward_std": 0.2275889366865158,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6236607432365417,
+      "rewards/curriculum_aware_reward_fn/std": 0.38208532333374023,
+      "step": 2518
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1959.0,
+      "completions/max_terminated_length": 1959.0,
+      "completions/mean_length": 557.75,
+      "completions/mean_terminated_length": 557.75,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 2.5994325509414495,
+      "grad_norm": 0.6883637309074402,
+      "kl": 0.12890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0231,
+      "num_tokens": 297318364.0,
+      "reward": 1.4714287519454956,
+      "reward_std": 0.18093356490135193,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4714285731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.43937474489212036,
+      "step": 2519
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1787.0,
+      "completions/mean_length": 554.3482666015625,
+      "completions/mean_terminated_length": 522.4414672851562,
+      "completions/min_length": 141.0,
+      "completions/min_terminated_length": 141.0,
+      "epoch": 2.6004642765024504,
+      "grad_norm": 0.6628627777099609,
+      "kl": 0.1385498046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0059,
+      "num_tokens": 297437158.0,
+      "reward": 1.5455358028411865,
+      "reward_std": 0.21819821000099182,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5544642806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.43119117617607117,
+      "step": 2520
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1789.0,
+      "completions/mean_length": 547.25,
+      "completions/mean_terminated_length": 515.279296875,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 2.601496002063451,
+      "grad_norm": 0.673978865146637,
+      "kl": 0.14794921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0043,
+      "num_tokens": 297566210.0,
+      "reward": 1.531250238418579,
+      "reward_std": 0.22155684232711792,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.53125,
+      "rewards/curriculum_aware_reward_fn/std": 0.39066386222839355,
+      "step": 2521
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3205.0,
+      "completions/max_terminated_length": 3205.0,
+      "completions/mean_length": 599.8303833007812,
+      "completions/mean_terminated_length": 599.8303833007812,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 2.602527727624452,
+      "grad_norm": 0.7220893502235413,
+      "kl": 0.127685546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0695,
+      "num_tokens": 297710517.0,
+      "reward": 1.4861608743667603,
+      "reward_std": 0.21848052740097046,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4861607253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.41774865984916687,
+      "step": 2522
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2624.0,
+      "completions/max_terminated_length": 2624.0,
+      "completions/mean_length": 521.732177734375,
+      "completions/mean_terminated_length": 521.732177734375,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 2.6035594531854525,
+      "grad_norm": 0.6924219131469727,
+      "kl": 0.1416015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0115,
+      "num_tokens": 297828591.0,
+      "reward": 1.5562502145767212,
+      "reward_std": 0.19886499643325806,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5562499761581421,
+      "rewards/curriculum_aware_reward_fn/std": 0.5339013338088989,
+      "step": 2523
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2363.0,
+      "completions/max_terminated_length": 2363.0,
+      "completions/mean_length": 526.1875,
+      "completions/mean_terminated_length": 526.1875,
+      "completions/min_length": 85.0,
+      "completions/min_terminated_length": 85.0,
+      "epoch": 2.6045911787464533,
+      "grad_norm": 0.6960251927375793,
+      "kl": 0.128662109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0337,
+      "num_tokens": 297961640.0,
+      "reward": 1.5500000715255737,
+      "reward_std": 0.24628612399101257,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5589285492897034,
+      "rewards/curriculum_aware_reward_fn/std": 0.4280804693698883,
+      "step": 2524
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2201.0,
+      "completions/max_terminated_length": 2201.0,
+      "completions/mean_length": 553.5535888671875,
+      "completions/mean_terminated_length": 553.5535888671875,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 2.605622904307454,
+      "grad_norm": 0.6908318400382996,
+      "kl": 0.1285400390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0614,
+      "num_tokens": 298090236.0,
+      "reward": 1.440178632736206,
+      "reward_std": 0.20642748475074768,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4401785731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.3992582857608795,
+      "step": 2525
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1919.0,
+      "completions/mean_length": 617.3482666015625,
+      "completions/mean_terminated_length": 586.009033203125,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 2.606654629868455,
+      "grad_norm": 0.5960627794265747,
+      "kl": 0.13525390625,
+      "learning_rate": 1e-06,
+      "loss": 0.024,
+      "num_tokens": 298226439.0,
+      "reward": 1.3562501668930054,
+      "reward_std": 0.17817705869674683,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.40637513995170593,
+      "step": 2526
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1974.0,
+      "completions/max_terminated_length": 1974.0,
+      "completions/mean_length": 567.3214721679688,
+      "completions/mean_terminated_length": 567.3214721679688,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 2.607686355429456,
+      "grad_norm": 0.6354534029960632,
+      "kl": 0.1298828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0303,
+      "num_tokens": 298359160.0,
+      "reward": 1.4379465579986572,
+      "reward_std": 0.1600869745016098,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43794646859169006,
+      "rewards/curriculum_aware_reward_fn/std": 0.4330514669418335,
+      "step": 2527
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2765.0,
+      "completions/max_terminated_length": 2765.0,
+      "completions/mean_length": 476.1607360839844,
+      "completions/mean_terminated_length": 476.1607360839844,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 2.6087180809904567,
+      "grad_norm": 0.6645422577857971,
+      "kl": 0.1376953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0219,
+      "num_tokens": 298470590.0,
+      "reward": 1.6022323369979858,
+      "reward_std": 0.24282428622245789,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6111606955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.43109485507011414,
+      "step": 2528
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3282.0,
+      "completions/max_terminated_length": 3282.0,
+      "completions/mean_length": 510.7410888671875,
+      "completions/mean_terminated_length": 510.7410888671875,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 2.6097498065514575,
+      "grad_norm": 0.845571756362915,
+      "kl": 0.14794921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0062,
+      "num_tokens": 298594391.0,
+      "reward": 1.505357265472412,
+      "reward_std": 0.19354453682899475,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5053571462631226,
+      "rewards/curriculum_aware_reward_fn/std": 0.4207113981246948,
+      "step": 2529
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1820.0,
+      "completions/max_terminated_length": 1820.0,
+      "completions/mean_length": 572.3214721679688,
+      "completions/mean_terminated_length": 572.3214721679688,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 2.6107815321124583,
+      "grad_norm": 0.6434414982795715,
+      "kl": 0.142333984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0562,
+      "num_tokens": 298721022.0,
+      "reward": 1.5629465579986572,
+      "reward_std": 0.21066059172153473,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5629464387893677,
+      "rewards/curriculum_aware_reward_fn/std": 0.41366007924079895,
+      "step": 2530
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1742.0,
+      "completions/max_terminated_length": 1742.0,
+      "completions/mean_length": 463.01788330078125,
+      "completions/mean_terminated_length": 463.01788330078125,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 2.6118132576734587,
+      "grad_norm": 0.593588650226593,
+      "kl": 0.1396484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0175,
+      "num_tokens": 298840143.0,
+      "reward": 1.5968750715255737,
+      "reward_std": 0.1363830864429474,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.596875011920929,
+      "rewards/curriculum_aware_reward_fn/std": 0.4179142415523529,
+      "step": 2531
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1429.0,
+      "completions/max_terminated_length": 1429.0,
+      "completions/mean_length": 498.76788330078125,
+      "completions/mean_terminated_length": 498.76788330078125,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 2.6128449832344596,
+      "grad_norm": 0.7416149377822876,
+      "kl": 0.1319580078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0052,
+      "num_tokens": 298963800.0,
+      "reward": 1.48035728931427,
+      "reward_std": 0.1891711801290512,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4803571403026581,
+      "rewards/curriculum_aware_reward_fn/std": 0.4251844882965088,
+      "step": 2532
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2005.0,
+      "completions/max_terminated_length": 2005.0,
+      "completions/mean_length": 556.6785888671875,
+      "completions/mean_terminated_length": 556.6785888671875,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "epoch": 2.6138767087954604,
+      "grad_norm": 0.7315823435783386,
+      "kl": 0.1357421875,
+      "learning_rate": 1e-06,
+      "loss": 0.03,
+      "num_tokens": 299091003.0,
+      "reward": 1.3558037281036377,
+      "reward_std": 0.25385361909866333,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36473211646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.3844756782054901,
+      "step": 2533
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1647.0,
+      "completions/max_terminated_length": 1647.0,
+      "completions/mean_length": 469.3660888671875,
+      "completions/mean_terminated_length": 469.3660888671875,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "epoch": 2.6149084343564613,
+      "grad_norm": 0.6682936549186707,
+      "kl": 0.135986328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0119,
+      "num_tokens": 299204115.0,
+      "reward": 1.5580357313156128,
+      "reward_std": 0.1919260323047638,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5580357313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.422644704580307,
+      "step": 2534
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1236.0,
+      "completions/max_terminated_length": 1236.0,
+      "completions/mean_length": 458.0535888671875,
+      "completions/mean_terminated_length": 458.0535888671875,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 2.615940159917462,
+      "grad_norm": 0.6064454317092896,
+      "kl": 0.13720703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0192,
+      "num_tokens": 299328525.0,
+      "reward": 1.5906251668930054,
+      "reward_std": 0.18175432085990906,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5995535850524902,
+      "rewards/curriculum_aware_reward_fn/std": 0.4349845051765442,
+      "step": 2535
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1414.0,
+      "completions/max_terminated_length": 1414.0,
+      "completions/mean_length": 455.51788330078125,
+      "completions/mean_terminated_length": 455.51788330078125,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 2.6169718854784625,
+      "grad_norm": 0.7744899988174438,
+      "kl": 0.130126953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0569,
+      "num_tokens": 299441549.0,
+      "reward": 1.552232265472412,
+      "reward_std": 0.24021115899085999,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.561160683631897,
+      "rewards/curriculum_aware_reward_fn/std": 0.38687413930892944,
+      "step": 2536
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1580.0,
+      "completions/max_terminated_length": 1580.0,
+      "completions/mean_length": 448.5714416503906,
+      "completions/mean_terminated_length": 448.5714416503906,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "epoch": 2.6180036110394633,
+      "grad_norm": 0.5266478061676025,
+      "kl": 0.1337890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0408,
+      "num_tokens": 299564078.0,
+      "reward": 1.6906249523162842,
+      "reward_std": 0.0914338007569313,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6906250715255737,
+      "rewards/curriculum_aware_reward_fn/std": 0.4176042079925537,
+      "step": 2537
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1972.0,
+      "completions/max_terminated_length": 1972.0,
+      "completions/mean_length": 521.294677734375,
+      "completions/mean_terminated_length": 521.294677734375,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 2.619035336600464,
+      "grad_norm": 0.6374189853668213,
+      "kl": 0.1285400390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0075,
+      "num_tokens": 299684952.0,
+      "reward": 1.4250000715255737,
+      "reward_std": 0.14668266475200653,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42499998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.43959441781044006,
+      "step": 2538
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3921.0,
+      "completions/max_terminated_length": 3921.0,
+      "completions/mean_length": 571.7589721679688,
+      "completions/mean_terminated_length": 571.7589721679688,
+      "completions/min_length": 166.0,
+      "completions/min_terminated_length": 166.0,
+      "epoch": 2.620067062161465,
+      "grad_norm": 0.6008320450782776,
+      "kl": 0.11767578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0187,
+      "num_tokens": 299816427.0,
+      "reward": 1.3428572416305542,
+      "reward_std": 0.22932888567447662,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35178571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.39329564571380615,
+      "step": 2539
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2890.0,
+      "completions/max_terminated_length": 2890.0,
+      "completions/mean_length": 609.6785888671875,
+      "completions/mean_terminated_length": 609.6785888671875,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "epoch": 2.621098787722466,
+      "grad_norm": 0.5431801676750183,
+      "kl": 0.1221923828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0254,
+      "num_tokens": 299961053.0,
+      "reward": 1.5415178537368774,
+      "reward_std": 0.17546537518501282,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5415178537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.45161813497543335,
+      "step": 2540
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1227.0,
+      "completions/max_terminated_length": 1227.0,
+      "completions/mean_length": 449.982177734375,
+      "completions/mean_terminated_length": 449.982177734375,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "epoch": 2.6221305132834667,
+      "grad_norm": 0.7313409447669983,
+      "kl": 0.13720703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0164,
+      "num_tokens": 300079855.0,
+      "reward": 1.6004464626312256,
+      "reward_std": 0.1854362189769745,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.609375,
+      "rewards/curriculum_aware_reward_fn/std": 0.42162978649139404,
+      "step": 2541
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3242.0,
+      "completions/max_terminated_length": 3242.0,
+      "completions/mean_length": 606.7857666015625,
+      "completions/mean_terminated_length": 606.7857666015625,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 2.6231622388444675,
+      "grad_norm": 0.6683008074760437,
+      "kl": 0.106201171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0119,
+      "num_tokens": 300214511.0,
+      "reward": 1.411160945892334,
+      "reward_std": 0.22779394686222076,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41116073727607727,
+      "rewards/curriculum_aware_reward_fn/std": 0.40944427251815796,
+      "step": 2542
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1575.0,
+      "completions/mean_length": 567.5089721679688,
+      "completions/mean_terminated_length": 535.7207641601562,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 2.6241939644054684,
+      "grad_norm": 0.5849696397781372,
+      "kl": 0.1298828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0309,
+      "num_tokens": 300350543.0,
+      "reward": 1.4758929014205933,
+      "reward_std": 0.21480746567249298,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4758928120136261,
+      "rewards/curriculum_aware_reward_fn/std": 0.4028604030609131,
+      "step": 2543
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1424.0,
+      "completions/max_terminated_length": 1424.0,
+      "completions/mean_length": 483.95538330078125,
+      "completions/mean_terminated_length": 483.95538330078125,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 2.6252256899664688,
+      "grad_norm": 0.6758949756622314,
+      "kl": 0.1143798828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0239,
+      "num_tokens": 300473262.0,
+      "reward": 1.5455358028411865,
+      "reward_std": 0.27307796478271484,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5633928775787354,
+      "rewards/curriculum_aware_reward_fn/std": 0.41405391693115234,
+      "step": 2544
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3222.0,
+      "completions/max_terminated_length": 3222.0,
+      "completions/mean_length": 604.9553833007812,
+      "completions/mean_terminated_length": 604.9553833007812,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 2.6262574155274696,
+      "grad_norm": 0.6471558213233948,
+      "kl": 0.122314453125,
+      "learning_rate": 1e-06,
+      "loss": 0.027,
+      "num_tokens": 300611724.0,
+      "reward": 1.3008928298950195,
+      "reward_std": 0.23042990267276764,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.30089282989501953,
+      "rewards/curriculum_aware_reward_fn/std": 0.396492600440979,
+      "step": 2545
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3957.0,
+      "completions/max_terminated_length": 3957.0,
+      "completions/mean_length": 596.3303833007812,
+      "completions/mean_terminated_length": 596.3303833007812,
+      "completions/min_length": 153.0,
+      "completions/min_terminated_length": 153.0,
+      "epoch": 2.6272891410884704,
+      "grad_norm": 0.6921994090080261,
+      "kl": 0.11474609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0055,
+      "num_tokens": 300742122.0,
+      "reward": 1.4174107313156128,
+      "reward_std": 0.22880640625953674,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4174107015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.41745203733444214,
+      "step": 2546
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1735.0,
+      "completions/max_terminated_length": 1735.0,
+      "completions/mean_length": 408.3571472167969,
+      "completions/mean_terminated_length": 408.3571472167969,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 2.6283208666494713,
+      "grad_norm": 0.7727930545806885,
+      "kl": 0.1363525390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0327,
+      "num_tokens": 300849696.0,
+      "reward": 1.610267996788025,
+      "reward_std": 0.22193004190921783,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6102678179740906,
+      "rewards/curriculum_aware_reward_fn/std": 0.49160236120224,
+      "step": 2547
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1853.0,
+      "completions/max_terminated_length": 1853.0,
+      "completions/mean_length": 487.9285888671875,
+      "completions/mean_terminated_length": 487.9285888671875,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 2.629352592210472,
+      "grad_norm": 0.603735089302063,
+      "kl": 0.12255859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0085,
+      "num_tokens": 300972384.0,
+      "reward": 1.5267857313156128,
+      "reward_std": 0.12053783982992172,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5267857313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.4188104569911957,
+      "step": 2548
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2411.0,
+      "completions/max_terminated_length": 2411.0,
+      "completions/mean_length": 566.6607666015625,
+      "completions/mean_terminated_length": 566.6607666015625,
+      "completions/min_length": 152.0,
+      "completions/min_terminated_length": 152.0,
+      "epoch": 2.6303843177714725,
+      "grad_norm": 0.5660789608955383,
+      "kl": 0.1123046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0688,
+      "num_tokens": 301105752.0,
+      "reward": 1.4008928537368774,
+      "reward_std": 0.2004423588514328,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40089282393455505,
+      "rewards/curriculum_aware_reward_fn/std": 0.4820649325847626,
+      "step": 2549
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3659.0,
+      "completions/max_terminated_length": 3659.0,
+      "completions/mean_length": 515.2857666015625,
+      "completions/mean_terminated_length": 515.2857666015625,
+      "completions/min_length": 151.0,
+      "completions/min_terminated_length": 151.0,
+      "epoch": 2.6314160433324734,
+      "grad_norm": 0.5825809240341187,
+      "kl": 0.1177978515625,
+      "learning_rate": 1e-06,
+      "loss": 0.048,
+      "num_tokens": 301230902.0,
+      "reward": 1.4799107313156128,
+      "reward_std": 0.16736774146556854,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4888392984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.43093812465667725,
+      "step": 2550
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2199.0,
+      "completions/max_terminated_length": 2199.0,
+      "completions/mean_length": 524.2678833007812,
+      "completions/mean_terminated_length": 524.2678833007812,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 2.632447768893474,
+      "grad_norm": 0.6821950078010559,
+      "kl": 0.1357421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0136,
+      "num_tokens": 301361853.0,
+      "reward": 1.5075894594192505,
+      "reward_std": 0.24523700773715973,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5254464149475098,
+      "rewards/curriculum_aware_reward_fn/std": 0.4395429491996765,
+      "step": 2551
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1749.0,
+      "completions/max_terminated_length": 1749.0,
+      "completions/mean_length": 484.2857360839844,
+      "completions/mean_terminated_length": 484.2857360839844,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 2.633479494454475,
+      "grad_norm": 0.7281054854393005,
+      "kl": 0.130126953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0172,
+      "num_tokens": 301483888.0,
+      "reward": 1.4991072416305542,
+      "reward_std": 0.23039059340953827,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5080357193946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.42471790313720703,
+      "step": 2552
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2207.0,
+      "completions/max_terminated_length": 2207.0,
+      "completions/mean_length": 474.27679443359375,
+      "completions/mean_terminated_length": 474.27679443359375,
+      "completions/min_length": 134.0,
+      "completions/min_terminated_length": 134.0,
+      "epoch": 2.634511220015476,
+      "grad_norm": 0.7366987466812134,
+      "kl": 0.1273193359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0212,
+      "num_tokens": 301610850.0,
+      "reward": 1.4883930683135986,
+      "reward_std": 0.16512981057167053,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48839282989501953,
+      "rewards/curriculum_aware_reward_fn/std": 0.4101187586784363,
+      "step": 2553
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1498.0,
+      "completions/max_terminated_length": 1498.0,
+      "completions/mean_length": 478.6607360839844,
+      "completions/mean_terminated_length": 478.6607360839844,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 2.6355429455764767,
+      "grad_norm": 0.7663374543190002,
+      "kl": 0.1181640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0367,
+      "num_tokens": 301738373.0,
+      "reward": 1.5843751430511475,
+      "reward_std": 0.13005556166172028,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5843749642372131,
+      "rewards/curriculum_aware_reward_fn/std": 0.3850255310535431,
+      "step": 2554
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1332.0,
+      "completions/max_terminated_length": 1332.0,
+      "completions/mean_length": 518.6964721679688,
+      "completions/mean_terminated_length": 518.6964721679688,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 2.6365746711374776,
+      "grad_norm": 0.656261682510376,
+      "kl": 0.1309814453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0356,
+      "num_tokens": 301862337.0,
+      "reward": 1.3651787042617798,
+      "reward_std": 0.18952900171279907,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36517858505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.38723084330558777,
+      "step": 2555
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2099.0,
+      "completions/max_terminated_length": 2099.0,
+      "completions/mean_length": 520.4732666015625,
+      "completions/mean_terminated_length": 520.4732666015625,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 2.6376063966984784,
+      "grad_norm": 0.6241275668144226,
+      "kl": 0.1243896484375,
+      "learning_rate": 1e-06,
+      "loss": 0.014,
+      "num_tokens": 301995817.0,
+      "reward": 1.55848228931427,
+      "reward_std": 0.18228021264076233,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5584821701049805,
+      "rewards/curriculum_aware_reward_fn/std": 0.4161648452281952,
+      "step": 2556
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1396.0,
+      "completions/max_terminated_length": 1396.0,
+      "completions/mean_length": 475.0982360839844,
+      "completions/mean_terminated_length": 475.0982360839844,
+      "completions/min_length": 127.0,
+      "completions/min_terminated_length": 127.0,
+      "epoch": 2.6386381222594792,
+      "grad_norm": 0.6838352680206299,
+      "kl": 0.1373291015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0055,
+      "num_tokens": 302114982.0,
+      "reward": 1.4785715341567993,
+      "reward_std": 0.18479207158088684,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47857144474983215,
+      "rewards/curriculum_aware_reward_fn/std": 0.4036831855773926,
+      "step": 2557
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2031.0,
+      "completions/max_terminated_length": 2031.0,
+      "completions/mean_length": 562.3392944335938,
+      "completions/mean_terminated_length": 562.3392944335938,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 2.6396698478204796,
+      "grad_norm": 0.7007812261581421,
+      "kl": 0.1204833984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0091,
+      "num_tokens": 302251226.0,
+      "reward": 1.380357265472412,
+      "reward_std": 0.22040167450904846,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38035711646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.38399022817611694,
+      "step": 2558
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1885.0,
+      "completions/max_terminated_length": 1885.0,
+      "completions/mean_length": 521.9107666015625,
+      "completions/mean_terminated_length": 521.9107666015625,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 2.6407015733814805,
+      "grad_norm": 0.7604739665985107,
+      "kl": 0.124267578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0204,
+      "num_tokens": 302379274.0,
+      "reward": 1.4879463911056519,
+      "reward_std": 0.24727526307106018,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48794645071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.3997885584831238,
+      "step": 2559
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2391.0,
+      "completions/max_terminated_length": 2391.0,
+      "completions/mean_length": 510.294677734375,
+      "completions/mean_terminated_length": 510.294677734375,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 2.6417332989424813,
+      "grad_norm": 0.7016016244888306,
+      "kl": 0.1258544921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0323,
+      "num_tokens": 302505951.0,
+      "reward": 1.4575893878936768,
+      "reward_std": 0.16625343263149261,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4665178656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.4224226474761963,
+      "step": 2560
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1456.0,
+      "completions/max_terminated_length": 1456.0,
+      "completions/mean_length": 479.9464416503906,
+      "completions/mean_terminated_length": 479.9464416503906,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 2.642765024503482,
+      "grad_norm": 0.6770477890968323,
+      "kl": 0.1195068359375,
+      "learning_rate": 1e-06,
+      "loss": -0.002,
+      "num_tokens": 302624770.0,
+      "reward": 1.5647321939468384,
+      "reward_std": 0.15951691567897797,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5647321343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.4194396734237671,
+      "step": 2561
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2726.0,
+      "completions/max_terminated_length": 2726.0,
+      "completions/mean_length": 537.6785888671875,
+      "completions/mean_terminated_length": 537.6785888671875,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "epoch": 2.643796750064483,
+      "grad_norm": 0.7363307476043701,
+      "kl": 0.116455078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0104,
+      "num_tokens": 302751000.0,
+      "reward": 1.5415178537368774,
+      "reward_std": 0.16541747748851776,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5415178537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.39824438095092773,
+      "step": 2562
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1431.0,
+      "completions/max_terminated_length": 1431.0,
+      "completions/mean_length": 508.4375305175781,
+      "completions/mean_terminated_length": 508.4375305175781,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 2.6448284756254834,
+      "grad_norm": 0.7421542406082153,
+      "kl": 0.1240234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0049,
+      "num_tokens": 302873878.0,
+      "reward": 1.5535715818405151,
+      "reward_std": 0.2116728276014328,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5535714030265808,
+      "rewards/curriculum_aware_reward_fn/std": 0.39322611689567566,
+      "step": 2563
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1970.0,
+      "completions/max_terminated_length": 1970.0,
+      "completions/mean_length": 564.5892944335938,
+      "completions/mean_terminated_length": 564.5892944335938,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 2.6458602011864842,
+      "grad_norm": 0.7115163207054138,
+      "kl": 0.11474609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0162,
+      "num_tokens": 303002751.0,
+      "reward": 1.4352679252624512,
+      "reward_std": 0.16417628526687622,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4352678656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.38541179895401,
+      "step": 2564
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2077.0,
+      "completions/max_terminated_length": 2077.0,
+      "completions/mean_length": 513.625,
+      "completions/mean_terminated_length": 513.625,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 2.646891926747485,
+      "grad_norm": 0.6098670363426208,
+      "kl": 0.1158447265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0002,
+      "num_tokens": 303134155.0,
+      "reward": 1.5218751430511475,
+      "reward_std": 0.16423504054546356,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5218750238418579,
+      "rewards/curriculum_aware_reward_fn/std": 0.44178083539009094,
+      "step": 2565
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2776.0,
+      "completions/max_terminated_length": 2776.0,
+      "completions/mean_length": 496.76788330078125,
+      "completions/mean_terminated_length": 496.76788330078125,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 2.647923652308486,
+      "grad_norm": 0.7642014026641846,
+      "kl": 0.1285400390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0455,
+      "num_tokens": 303257196.0,
+      "reward": 1.4812500476837158,
+      "reward_std": 0.20399358868598938,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49910715222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.4325694441795349,
+      "step": 2566
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1803.0,
+      "completions/max_terminated_length": 1803.0,
+      "completions/mean_length": 557.1160888671875,
+      "completions/mean_terminated_length": 557.1160888671875,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 2.6489553778694868,
+      "grad_norm": 0.6560217142105103,
+      "kl": 0.1214599609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0226,
+      "num_tokens": 303389964.0,
+      "reward": 1.4401787519454956,
+      "reward_std": 0.24112722277641296,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4401785731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.3977888822555542,
+      "step": 2567
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2036.0,
+      "completions/max_terminated_length": 2036.0,
+      "completions/mean_length": 537.2142944335938,
+      "completions/mean_terminated_length": 537.2142944335938,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 2.6499871034304876,
+      "grad_norm": 0.6953210234642029,
+      "kl": 0.1326904296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0027,
+      "num_tokens": 303512992.0,
+      "reward": 1.541517972946167,
+      "reward_std": 0.17343279719352722,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5415178537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.4311564266681671,
+      "step": 2568
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2856.0,
+      "completions/max_terminated_length": 2856.0,
+      "completions/mean_length": 517.669677734375,
+      "completions/mean_terminated_length": 517.669677734375,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 2.6510188289914884,
+      "grad_norm": 0.6415396928787231,
+      "kl": 0.114013671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0173,
+      "num_tokens": 303638171.0,
+      "reward": 1.446428656578064,
+      "reward_std": 0.16090556979179382,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4464285671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.43973347544670105,
+      "step": 2569
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1144.0,
+      "completions/max_terminated_length": 1144.0,
+      "completions/mean_length": 457.3660888671875,
+      "completions/mean_terminated_length": 457.3660888671875,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 2.6520505545524893,
+      "grad_norm": 1.054216980934143,
+      "kl": 0.136962890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0309,
+      "num_tokens": 303756085.0,
+      "reward": 1.4754465818405151,
+      "reward_std": 0.2000415027141571,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4754464328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.43074390292167664,
+      "step": 2570
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1692.0,
+      "completions/max_terminated_length": 1692.0,
+      "completions/mean_length": 505.46429443359375,
+      "completions/mean_terminated_length": 505.46429443359375,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 2.6530822801134897,
+      "grad_norm": 0.6751531958580017,
+      "kl": 0.1279296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0585,
+      "num_tokens": 303874836.0,
+      "reward": 1.3433035612106323,
+      "reward_std": 0.21848861873149872,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3433035910129547,
+      "rewards/curriculum_aware_reward_fn/std": 0.41058602929115295,
+      "step": 2571
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2091.0,
+      "completions/max_terminated_length": 2091.0,
+      "completions/mean_length": 484.1250305175781,
+      "completions/mean_terminated_length": 484.1250305175781,
+      "completions/min_length": 158.0,
+      "completions/min_terminated_length": 158.0,
+      "epoch": 2.6541140056744905,
+      "grad_norm": 0.7291446924209595,
+      "kl": 0.148193359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0016,
+      "num_tokens": 303996482.0,
+      "reward": 1.5625001192092896,
+      "reward_std": 0.20619072020053864,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5625,
+      "rewards/curriculum_aware_reward_fn/std": 0.4297233819961548,
+      "step": 2572
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2233.0,
+      "completions/mean_length": 554.5267944335938,
+      "completions/mean_terminated_length": 522.6216430664062,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 2.6551457312354914,
+      "grad_norm": 0.6747497320175171,
+      "kl": 0.11474609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0196,
+      "num_tokens": 304126818.0,
+      "reward": 1.4669644832611084,
+      "reward_std": 0.2268751561641693,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4669643044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.3921845555305481,
+      "step": 2573
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2438.0,
+      "completions/max_terminated_length": 2438.0,
+      "completions/mean_length": 534.7589721679688,
+      "completions/mean_terminated_length": 534.7589721679688,
+      "completions/min_length": 153.0,
+      "completions/min_terminated_length": 153.0,
+      "epoch": 2.656177456796492,
+      "grad_norm": 0.6876260638237,
+      "kl": 0.1258544921875,
+      "learning_rate": 1e-06,
+      "loss": -0.001,
+      "num_tokens": 304252161.0,
+      "reward": 1.6241072416305542,
+      "reward_std": 0.2519568204879761,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6330356597900391,
+      "rewards/curriculum_aware_reward_fn/std": 0.37797191739082336,
+      "step": 2574
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2329.0,
+      "completions/max_terminated_length": 2329.0,
+      "completions/mean_length": 524.0357666015625,
+      "completions/mean_terminated_length": 524.0357666015625,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 2.657209182357493,
+      "grad_norm": 0.4913746416568756,
+      "kl": 0.14208984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0323,
+      "num_tokens": 304370920.0,
+      "reward": 1.770535945892334,
+      "reward_std": 0.14546895027160645,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7705356478691101,
+      "rewards/curriculum_aware_reward_fn/std": 0.35852259397506714,
+      "step": 2575
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3775.0,
+      "completions/max_terminated_length": 3775.0,
+      "completions/mean_length": 506.794677734375,
+      "completions/mean_terminated_length": 506.794677734375,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "epoch": 2.6582409079184934,
+      "grad_norm": 0.6894189715385437,
+      "kl": 0.131591796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0248,
+      "num_tokens": 304489288.0,
+      "reward": 1.5232144594192505,
+      "reward_std": 0.16796723008155823,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5232142806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.43100178241729736,
+      "step": 2576
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2335.0,
+      "completions/mean_length": 689.5000610351562,
+      "completions/mean_terminated_length": 658.8108520507812,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 2.6592726334794943,
+      "grad_norm": 0.6391696929931641,
+      "kl": 0.118896484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0477,
+      "num_tokens": 304634608.0,
+      "reward": 1.2937500476837158,
+      "reward_std": 0.2136712223291397,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.400232195854187,
+      "step": 2577
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1691.0,
+      "completions/max_terminated_length": 1691.0,
+      "completions/mean_length": 562.8303833007812,
+      "completions/mean_terminated_length": 562.8303833007812,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 2.660304359040495,
+      "grad_norm": 0.5994539260864258,
+      "kl": 0.1280517578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0754,
+      "num_tokens": 304763151.0,
+      "reward": 1.520982265472412,
+      "reward_std": 0.14971788227558136,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5209821462631226,
+      "rewards/curriculum_aware_reward_fn/std": 0.4159347414970398,
+      "step": 2578
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1690.0,
+      "completions/max_terminated_length": 1690.0,
+      "completions/mean_length": 548.9732666015625,
+      "completions/mean_terminated_length": 548.9732666015625,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 2.661336084601496,
+      "grad_norm": 0.5327616930007935,
+      "kl": 0.1219482421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0258,
+      "num_tokens": 304882141.0,
+      "reward": 1.5861608982086182,
+      "reward_std": 0.12342019379138947,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5861606597900391,
+      "rewards/curriculum_aware_reward_fn/std": 0.4475765526294708,
+      "step": 2579
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3053.0,
+      "completions/mean_length": 714.8839721679688,
+      "completions/mean_terminated_length": 684.4234619140625,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 2.662367810162497,
+      "grad_norm": 0.6151833534240723,
+      "kl": 0.1142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0252,
+      "num_tokens": 305038725.0,
+      "reward": 1.4794644117355347,
+      "reward_std": 0.23122107982635498,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4794642925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.40372997522354126,
+      "step": 2580
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3521.0,
+      "completions/max_terminated_length": 3521.0,
+      "completions/mean_length": 678.5982666015625,
+      "completions/mean_terminated_length": 678.5982666015625,
+      "completions/min_length": 287.0,
+      "completions/min_terminated_length": 287.0,
+      "epoch": 2.6633995357234976,
+      "grad_norm": 0.6278223991394043,
+      "kl": 0.115478515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0023,
+      "num_tokens": 305184987.0,
+      "reward": 1.4450894594192505,
+      "reward_std": 0.23164869844913483,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44508928060531616,
+      "rewards/curriculum_aware_reward_fn/std": 0.3950687050819397,
+      "step": 2581
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2595.0,
+      "completions/mean_length": 720.169677734375,
+      "completions/mean_terminated_length": 689.7567749023438,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "epoch": 2.6644312612844985,
+      "grad_norm": 0.5630807280540466,
+      "kl": 0.1201171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0031,
+      "num_tokens": 305335114.0,
+      "reward": 1.3691965341567993,
+      "reward_std": 0.1900208741426468,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36919641494750977,
+      "rewards/curriculum_aware_reward_fn/std": 0.38527193665504456,
+      "step": 2582
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3228.0,
+      "completions/mean_length": 759.3750610351562,
+      "completions/mean_terminated_length": 698.7090454101562,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 2.6654629868454993,
+      "grad_norm": 0.5652838945388794,
+      "kl": 0.105224609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0295,
+      "num_tokens": 305485075.0,
+      "reward": 1.4968751668930054,
+      "reward_std": 0.21129049360752106,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5147321224212646,
+      "rewards/curriculum_aware_reward_fn/std": 0.3854702413082123,
+      "step": 2583
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2923.0,
+      "completions/mean_length": 713.6875610351562,
+      "completions/mean_terminated_length": 652.19091796875,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 2.6664947124064997,
+      "grad_norm": 0.5213630199432373,
+      "kl": 0.1085205078125,
+      "learning_rate": 1e-06,
+      "loss": -0.0057,
+      "num_tokens": 305636870.0,
+      "reward": 1.5491071939468384,
+      "reward_std": 0.14842262864112854,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5491071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.40022414922714233,
+      "step": 2584
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2775.0,
+      "completions/max_terminated_length": 2775.0,
+      "completions/mean_length": 682.5089721679688,
+      "completions/mean_terminated_length": 682.5089721679688,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 2.6675264379675006,
+      "grad_norm": 0.6713928580284119,
+      "kl": 0.12060546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0121,
+      "num_tokens": 305781638.0,
+      "reward": 1.3986607789993286,
+      "reward_std": 0.2590733766555786,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40758925676345825,
+      "rewards/curriculum_aware_reward_fn/std": 0.4249117374420166,
+      "step": 2585
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2828.0,
+      "completions/max_terminated_length": 2828.0,
+      "completions/mean_length": 712.2767944335938,
+      "completions/mean_terminated_length": 712.2767944335938,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 2.6685581635285014,
+      "grad_norm": 0.5545308589935303,
+      "kl": 0.118896484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0429,
+      "num_tokens": 305926933.0,
+      "reward": 1.5424107313156128,
+      "reward_std": 0.2068289816379547,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5424107313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.3718840479850769,
+      "step": 2586
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3692.0,
+      "completions/max_terminated_length": 3692.0,
+      "completions/mean_length": 728.669677734375,
+      "completions/mean_terminated_length": 728.669677734375,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 2.6695898890895022,
+      "grad_norm": 0.5126776695251465,
+      "kl": 0.111083984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0081,
+      "num_tokens": 306075366.0,
+      "reward": 1.410267949104309,
+      "reward_std": 0.19241558015346527,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4102678596973419,
+      "rewards/curriculum_aware_reward_fn/std": 0.4012446105480194,
+      "step": 2587
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1849.0,
+      "completions/mean_length": 641.0178833007812,
+      "completions/mean_terminated_length": 609.8919067382812,
+      "completions/min_length": 121.0,
+      "completions/min_terminated_length": 121.0,
+      "epoch": 2.670621614650503,
+      "grad_norm": 0.535484254360199,
+      "kl": 0.1072998046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0035,
+      "num_tokens": 306211392.0,
+      "reward": 1.4098213911056519,
+      "reward_std": 0.1677635759115219,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40982145071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.433030366897583,
+      "step": 2588
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2886.0,
+      "completions/max_terminated_length": 2886.0,
+      "completions/mean_length": 670.8125,
+      "completions/mean_terminated_length": 670.8125,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 2.6716533402115035,
+      "grad_norm": 0.5733050107955933,
+      "kl": 0.105712890625,
+      "learning_rate": 1e-06,
+      "loss": -0.009,
+      "num_tokens": 306352806.0,
+      "reward": 1.5258928537368774,
+      "reward_std": 0.15302883088588715,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5258928537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.41394510865211487,
+      "step": 2589
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2376.0,
+      "completions/mean_length": 740.1160888671875,
+      "completions/mean_terminated_length": 709.8828735351562,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 2.6726850657725043,
+      "grad_norm": 0.5498374700546265,
+      "kl": 0.1051025390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0322,
+      "num_tokens": 306506861.0,
+      "reward": 1.5299108028411865,
+      "reward_std": 0.23045654594898224,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.529910683631897,
+      "rewards/curriculum_aware_reward_fn/std": 0.40821850299835205,
+      "step": 2590
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3696.0,
+      "completions/max_terminated_length": 3696.0,
+      "completions/mean_length": 621.982177734375,
+      "completions/mean_terminated_length": 621.982177734375,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "epoch": 2.673716791333505,
+      "grad_norm": 0.6307698488235474,
+      "kl": 0.1104736328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0456,
+      "num_tokens": 306633118.0,
+      "reward": 1.675446629524231,
+      "reward_std": 0.2829442024230957,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6754463911056519,
+      "rewards/curriculum_aware_reward_fn/std": 0.5178783535957336,
+      "step": 2591
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2641.0,
+      "completions/mean_length": 717.9285888671875,
+      "completions/mean_terminated_length": 687.4954833984375,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "epoch": 2.674748516894506,
+      "grad_norm": 0.4680657684803009,
+      "kl": 0.1002197265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0183,
+      "num_tokens": 306780957.0,
+      "reward": 1.5120537281036377,
+      "reward_std": 0.2273402363061905,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5209821462631226,
+      "rewards/curriculum_aware_reward_fn/std": 0.5374547243118286,
+      "step": 2592
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2491.0,
+      "completions/mean_length": 657.1339721679688,
+      "completions/mean_terminated_length": 626.1531982421875,
+      "completions/min_length": 273.0,
+      "completions/min_terminated_length": 273.0,
+      "epoch": 2.675780242455507,
+      "grad_norm": 0.5426839590072632,
+      "kl": 0.1026611328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0374,
+      "num_tokens": 306921364.0,
+      "reward": 1.3901787996292114,
+      "reward_std": 0.1473875343799591,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3901785910129547,
+      "rewards/curriculum_aware_reward_fn/std": 0.42218008637428284,
+      "step": 2593
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2568.0,
+      "completions/max_terminated_length": 2568.0,
+      "completions/mean_length": 586.9107666015625,
+      "completions/mean_terminated_length": 586.9107666015625,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 2.6768119680165077,
+      "grad_norm": 0.5409964323043823,
+      "kl": 0.1136474609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0101,
+      "num_tokens": 307055705.0,
+      "reward": 1.557142972946167,
+      "reward_std": 0.16628770530223846,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5571428537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.4004421532154083,
+      "step": 2594
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2072.0,
+      "completions/mean_length": 598.7232666015625,
+      "completions/mean_terminated_length": 567.2162475585938,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 2.6778436935775085,
+      "grad_norm": 0.5737888813018799,
+      "kl": 0.1060791015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0354,
+      "num_tokens": 307191040.0,
+      "reward": 1.4977679252624512,
+      "reward_std": 0.14505282044410706,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4977678656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.449142724275589,
+      "step": 2595
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3766.0,
+      "completions/max_terminated_length": 3766.0,
+      "completions/mean_length": 663.5535888671875,
+      "completions/mean_terminated_length": 663.5535888671875,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "epoch": 2.6788754191385094,
+      "grad_norm": 0.619125485420227,
+      "kl": 0.112548828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0082,
+      "num_tokens": 307330570.0,
+      "reward": 1.4647324085235596,
+      "reward_std": 0.22834059596061707,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46473217010498047,
+      "rewards/curriculum_aware_reward_fn/std": 0.42466938495635986,
+      "step": 2596
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3225.0,
+      "completions/mean_length": 693.1607666015625,
+      "completions/mean_terminated_length": 662.5045166015625,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 2.67990714469951,
+      "grad_norm": 0.4550981819629669,
+      "kl": 0.0999755859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0339,
+      "num_tokens": 307474676.0,
+      "reward": 1.533928632736206,
+      "reward_std": 0.11954943090677261,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5339285731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.45208296179771423,
+      "step": 2597
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1921.0,
+      "completions/max_terminated_length": 1921.0,
+      "completions/mean_length": 548.3303833007812,
+      "completions/mean_terminated_length": 548.3303833007812,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 2.6809388702605106,
+      "grad_norm": 0.6486082673072815,
+      "kl": 0.1171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0423,
+      "num_tokens": 307594133.0,
+      "reward": 1.4955357313156128,
+      "reward_std": 0.19790183007717133,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4955357015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.40177270770072937,
+      "step": 2598
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1431.0,
+      "completions/max_terminated_length": 1431.0,
+      "completions/mean_length": 614.2857666015625,
+      "completions/mean_terminated_length": 614.2857666015625,
+      "completions/min_length": 145.0,
+      "completions/min_terminated_length": 145.0,
+      "epoch": 2.6819705958215114,
+      "grad_norm": 0.6386866569519043,
+      "kl": 0.10986328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0095,
+      "num_tokens": 307734018.0,
+      "reward": 1.4941965341567993,
+      "reward_std": 0.18490223586559296,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49419641494750977,
+      "rewards/curriculum_aware_reward_fn/std": 0.43323346972465515,
+      "step": 2599
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3371.0,
+      "completions/max_terminated_length": 3371.0,
+      "completions/mean_length": 541.107177734375,
+      "completions/mean_terminated_length": 541.107177734375,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 2.6830023213825123,
+      "grad_norm": 0.6429569125175476,
+      "kl": 0.1248779296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0343,
+      "num_tokens": 307858861.0,
+      "reward": 1.5316965579986572,
+      "reward_std": 0.1632005274295807,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5316964387893677,
+      "rewards/curriculum_aware_reward_fn/std": 0.41940516233444214,
+      "step": 2600
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1910.0,
+      "completions/max_terminated_length": 1910.0,
+      "completions/mean_length": 525.919677734375,
+      "completions/mean_terminated_length": 525.919677734375,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 2.684034046943513,
+      "grad_norm": 0.6729943752288818,
+      "kl": 0.11474609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0122,
+      "num_tokens": 307981541.0,
+      "reward": 1.5808037519454956,
+      "reward_std": 0.23689059913158417,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5808035731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.42501965165138245,
+      "step": 2601
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1432.0,
+      "completions/max_terminated_length": 1432.0,
+      "completions/mean_length": 522.9285888671875,
+      "completions/mean_terminated_length": 522.9285888671875,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "epoch": 2.685065772504514,
+      "grad_norm": 0.5555217862129211,
+      "kl": 0.108154296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0684,
+      "num_tokens": 308102395.0,
+      "reward": 1.6214287281036377,
+      "reward_std": 0.1849846988916397,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6303571462631226,
+      "rewards/curriculum_aware_reward_fn/std": 0.4196929931640625,
+      "step": 2602
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3274.0,
+      "completions/max_terminated_length": 3274.0,
+      "completions/mean_length": 547.8482666015625,
+      "completions/mean_terminated_length": 547.8482666015625,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 2.6860974980655143,
+      "grad_norm": 0.6727123856544495,
+      "kl": 0.113037109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0696,
+      "num_tokens": 308227496.0,
+      "reward": 1.6245537996292114,
+      "reward_std": 0.19118796288967133,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6245535612106323,
+      "rewards/curriculum_aware_reward_fn/std": 0.4422503709793091,
+      "step": 2603
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1702.0,
+      "completions/max_terminated_length": 1702.0,
+      "completions/mean_length": 525.8660888671875,
+      "completions/mean_terminated_length": 525.8660888671875,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 2.687129223626515,
+      "grad_norm": 0.6231337189674377,
+      "kl": 0.1219482421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0053,
+      "num_tokens": 308362511.0,
+      "reward": 1.5691964626312256,
+      "reward_std": 0.1808921843767166,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5691964030265808,
+      "rewards/curriculum_aware_reward_fn/std": 0.45053747296333313,
+      "step": 2604
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3166.0,
+      "completions/max_terminated_length": 3166.0,
+      "completions/mean_length": 567.9732666015625,
+      "completions/mean_terminated_length": 567.9732666015625,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 2.688160949187516,
+      "grad_norm": 0.5709600448608398,
+      "kl": 0.112060546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0025,
+      "num_tokens": 308504111.0,
+      "reward": 1.540178656578064,
+      "reward_std": 0.14044509828090668,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5491071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.45167267322540283,
+      "step": 2605
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2727.0,
+      "completions/max_terminated_length": 2727.0,
+      "completions/mean_length": 446.9732360839844,
+      "completions/mean_terminated_length": 446.9732360839844,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 2.689192674748517,
+      "grad_norm": 0.711565375328064,
+      "kl": 0.111083984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0235,
+      "num_tokens": 308615912.0,
+      "reward": 1.7169643640518188,
+      "reward_std": 0.17919939756393433,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7169643044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.4117942154407501,
+      "step": 2606
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1516.0,
+      "completions/max_terminated_length": 1516.0,
+      "completions/mean_length": 549.8660888671875,
+      "completions/mean_terminated_length": 549.8660888671875,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 2.6902244003095177,
+      "grad_norm": 0.6628137826919556,
+      "kl": 0.11865234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0278,
+      "num_tokens": 308738551.0,
+      "reward": 1.5236608982086182,
+      "reward_std": 0.2348913550376892,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5236607193946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.4373837113380432,
+      "step": 2607
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1675.0,
+      "completions/mean_length": 593.5089721679688,
+      "completions/mean_terminated_length": 561.9549560546875,
+      "completions/min_length": 166.0,
+      "completions/min_terminated_length": 166.0,
+      "epoch": 2.6912561258705185,
+      "grad_norm": 0.6050135493278503,
+      "kl": 0.104248046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0199,
+      "num_tokens": 308877957.0,
+      "reward": 1.4187501668930054,
+      "reward_std": 0.23862913250923157,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42767858505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.4206836521625519,
+      "step": 2608
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3093.0,
+      "completions/max_terminated_length": 3093.0,
+      "completions/mean_length": 484.8750305175781,
+      "completions/mean_terminated_length": 484.8750305175781,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 2.6922878514315194,
+      "grad_norm": 0.6855359673500061,
+      "kl": 0.123291015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0385,
+      "num_tokens": 308991490.0,
+      "reward": 1.5218751430511475,
+      "reward_std": 0.20726396143436432,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5218750238418579,
+      "rewards/curriculum_aware_reward_fn/std": 0.4614792466163635,
+      "step": 2609
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2536.0,
+      "completions/max_terminated_length": 2536.0,
+      "completions/mean_length": 549.6964721679688,
+      "completions/mean_terminated_length": 549.6964721679688,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "epoch": 2.6933195769925202,
+      "grad_norm": 0.6386505961418152,
+      "kl": 0.1151123046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0554,
+      "num_tokens": 309129699.0,
+      "reward": 1.6308035850524902,
+      "reward_std": 0.20738892257213593,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6397321820259094,
+      "rewards/curriculum_aware_reward_fn/std": 0.4121541976928711,
+      "step": 2610
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2056.0,
+      "completions/max_terminated_length": 2056.0,
+      "completions/mean_length": 539.4732666015625,
+      "completions/mean_terminated_length": 539.4732666015625,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 2.6943513025535206,
+      "grad_norm": 0.6469184756278992,
+      "kl": 0.1051025390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0034,
+      "num_tokens": 309251454.0,
+      "reward": 1.4625000953674316,
+      "reward_std": 0.21683713793754578,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47142860293388367,
+      "rewards/curriculum_aware_reward_fn/std": 0.4458373188972473,
+      "step": 2611
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1871.0,
+      "completions/max_terminated_length": 1871.0,
+      "completions/mean_length": 487.669677734375,
+      "completions/mean_terminated_length": 487.669677734375,
+      "completions/min_length": 140.0,
+      "completions/min_terminated_length": 140.0,
+      "epoch": 2.6953830281145215,
+      "grad_norm": 0.6858476400375366,
+      "kl": 0.1148681640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0191,
+      "num_tokens": 309375723.0,
+      "reward": 1.3611608743667603,
+      "reward_std": 0.1952558159828186,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3611606955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.3955672085285187,
+      "step": 2612
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1518.0,
+      "completions/max_terminated_length": 1518.0,
+      "completions/mean_length": 524.4910888671875,
+      "completions/mean_terminated_length": 524.4910888671875,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 2.6964147536755223,
+      "grad_norm": 0.7016357183456421,
+      "kl": 0.129638671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0334,
+      "num_tokens": 309501393.0,
+      "reward": 1.4205358028411865,
+      "reward_std": 0.1922624260187149,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42053571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.3992421627044678,
+      "step": 2613
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2341.0,
+      "completions/max_terminated_length": 2341.0,
+      "completions/mean_length": 634.7410888671875,
+      "completions/mean_terminated_length": 634.7410888671875,
+      "completions/min_length": 111.0,
+      "completions/min_terminated_length": 111.0,
+      "epoch": 2.697446479236523,
+      "grad_norm": 0.6351801156997681,
+      "kl": 0.1048583984375,
+      "learning_rate": 1e-06,
+      "loss": -0.035,
+      "num_tokens": 309633706.0,
+      "reward": 1.4013392925262451,
+      "reward_std": 0.21559014916419983,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40133926272392273,
+      "rewards/curriculum_aware_reward_fn/std": 0.4269871115684509,
+      "step": 2614
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1732.0,
+      "completions/max_terminated_length": 1732.0,
+      "completions/mean_length": 546.3928833007812,
+      "completions/mean_terminated_length": 546.3928833007812,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 2.698478204797524,
+      "grad_norm": 0.7185891270637512,
+      "kl": 0.13037109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0217,
+      "num_tokens": 309758427.0,
+      "reward": 1.3339287042617798,
+      "reward_std": 0.181352436542511,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33392858505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.3564220666885376,
+      "step": 2615
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1137.0,
+      "completions/max_terminated_length": 1137.0,
+      "completions/mean_length": 467.33929443359375,
+      "completions/mean_terminated_length": 467.33929443359375,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 2.6995099303585244,
+      "grad_norm": 0.6989985704421997,
+      "kl": 0.127685546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0054,
+      "num_tokens": 309878459.0,
+      "reward": 1.5276787281036377,
+      "reward_std": 0.2159861922264099,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5366071462631226,
+      "rewards/curriculum_aware_reward_fn/std": 0.4386703073978424,
+      "step": 2616
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1435.0,
+      "completions/mean_length": 503.3214416503906,
+      "completions/mean_terminated_length": 470.9549560546875,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 2.700541655919525,
+      "grad_norm": 0.5956876277923584,
+      "kl": 0.110107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.013,
+      "num_tokens": 310002801.0,
+      "reward": 1.591071605682373,
+      "reward_std": 0.143727108836174,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5910714268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.42225342988967896,
+      "step": 2617
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2377.0,
+      "completions/max_terminated_length": 2377.0,
+      "completions/mean_length": 579.8482666015625,
+      "completions/mean_terminated_length": 579.8482666015625,
+      "completions/min_length": 166.0,
+      "completions/min_terminated_length": 166.0,
+      "epoch": 2.701573381480526,
+      "grad_norm": 0.7142231464385986,
+      "kl": 0.1292724609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0184,
+      "num_tokens": 310140138.0,
+      "reward": 1.5075894594192505,
+      "reward_std": 0.1956642121076584,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5075892806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.4249117374420166,
+      "step": 2618
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2371.0,
+      "completions/max_terminated_length": 2371.0,
+      "completions/mean_length": 528.4375,
+      "completions/mean_terminated_length": 528.4375,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 2.702605107041527,
+      "grad_norm": 0.6770662069320679,
+      "kl": 0.104248046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0519,
+      "num_tokens": 310259243.0,
+      "reward": 1.4629465341567993,
+      "reward_std": 0.17913131415843964,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47187501192092896,
+      "rewards/curriculum_aware_reward_fn/std": 0.42937105894088745,
+      "step": 2619
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1923.0,
+      "completions/max_terminated_length": 1923.0,
+      "completions/mean_length": 538.919677734375,
+      "completions/mean_terminated_length": 538.919677734375,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 2.7036368326025277,
+      "grad_norm": 0.7300458550453186,
+      "kl": 0.1259765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0344,
+      "num_tokens": 310384121.0,
+      "reward": 1.4950894117355347,
+      "reward_std": 0.20447319746017456,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4950892925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.3622974455356598,
+      "step": 2620
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1927.0,
+      "completions/max_terminated_length": 1927.0,
+      "completions/mean_length": 550.2142944335938,
+      "completions/mean_terminated_length": 550.2142944335938,
+      "completions/min_length": 142.0,
+      "completions/min_terminated_length": 142.0,
+      "epoch": 2.7046685581635286,
+      "grad_norm": 0.647113561630249,
+      "kl": 0.1112060546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0539,
+      "num_tokens": 310508759.0,
+      "reward": 1.5254465341567993,
+      "reward_std": 0.2031795084476471,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5254464149475098,
+      "rewards/curriculum_aware_reward_fn/std": 0.4302730858325958,
+      "step": 2621
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3646.0,
+      "completions/max_terminated_length": 3646.0,
+      "completions/mean_length": 577.2142944335938,
+      "completions/mean_terminated_length": 577.2142944335938,
+      "completions/min_length": 277.0,
+      "completions/min_terminated_length": 277.0,
+      "epoch": 2.7057002837245294,
+      "grad_norm": 4.672196865081787,
+      "kl": 0.124755859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0161,
+      "num_tokens": 310644422.0,
+      "reward": 1.4906251430511475,
+      "reward_std": 0.19837768375873566,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4906250536441803,
+      "rewards/curriculum_aware_reward_fn/std": 0.3786255121231079,
+      "step": 2622
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1268.0,
+      "completions/max_terminated_length": 1268.0,
+      "completions/mean_length": 491.669677734375,
+      "completions/mean_terminated_length": 491.669677734375,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 2.7067320092855303,
+      "grad_norm": 0.5772094130516052,
+      "kl": 0.125244140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0382,
+      "num_tokens": 310763245.0,
+      "reward": 1.5593750476837158,
+      "reward_std": 0.15548592805862427,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.559374988079071,
+      "rewards/curriculum_aware_reward_fn/std": 0.3872129023075104,
+      "step": 2623
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1660.0,
+      "completions/max_terminated_length": 1660.0,
+      "completions/mean_length": 440.0625305175781,
+      "completions/mean_terminated_length": 440.0625305175781,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 2.7077637348465307,
+      "grad_norm": 0.700062096118927,
+      "kl": 0.13427734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0094,
+      "num_tokens": 310881678.0,
+      "reward": 1.3589287996292114,
+      "reward_std": 0.20154716074466705,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3678571581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.3995412290096283,
+      "step": 2624
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1808.0,
+      "completions/max_terminated_length": 1808.0,
+      "completions/mean_length": 487.0714416503906,
+      "completions/mean_terminated_length": 487.0714416503906,
+      "completions/min_length": 138.0,
+      "completions/min_terminated_length": 138.0,
+      "epoch": 2.7087954604075315,
+      "grad_norm": 0.5862681269645691,
+      "kl": 0.119140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0409,
+      "num_tokens": 310998180.0,
+      "reward": 1.5602679252624512,
+      "reward_std": 0.15142853558063507,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5602678656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.4182297885417938,
+      "step": 2625
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2639.0,
+      "completions/max_terminated_length": 2639.0,
+      "completions/mean_length": 518.1517944335938,
+      "completions/mean_terminated_length": 518.1517944335938,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 2.7098271859685323,
+      "grad_norm": 0.7043747901916504,
+      "kl": 0.123779296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0186,
+      "num_tokens": 311122881.0,
+      "reward": 1.4928573369979858,
+      "reward_std": 0.20403318107128143,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4928571879863739,
+      "rewards/curriculum_aware_reward_fn/std": 0.42036324739456177,
+      "step": 2626
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2773.0,
+      "completions/mean_length": 614.2857666015625,
+      "completions/mean_terminated_length": 582.9189453125,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 2.710858911529533,
+      "grad_norm": 0.41442379355430603,
+      "kl": 0.1044921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0121,
+      "num_tokens": 311267316.0,
+      "reward": 1.4424108266830444,
+      "reward_std": 0.08449666202068329,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44241073727607727,
+      "rewards/curriculum_aware_reward_fn/std": 0.4626213014125824,
+      "step": 2627
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2518.0,
+      "completions/mean_length": 618.9285888671875,
+      "completions/mean_terminated_length": 587.6036376953125,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 2.711890637090534,
+      "grad_norm": 0.663968563079834,
+      "kl": 0.1131591796875,
+      "learning_rate": 1e-06,
+      "loss": -0.003,
+      "num_tokens": 311416212.0,
+      "reward": 1.504910945892334,
+      "reward_std": 0.23447324335575104,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5049107670783997,
+      "rewards/curriculum_aware_reward_fn/std": 0.40380844473838806,
+      "step": 2628
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1670.0,
+      "completions/max_terminated_length": 1670.0,
+      "completions/mean_length": 483.4910888671875,
+      "completions/mean_terminated_length": 483.4910888671875,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 2.712922362651535,
+      "grad_norm": 148691.9375,
+      "kl": 2608.08203125,
+      "learning_rate": 1e-06,
+      "loss": 26.2699,
+      "num_tokens": 311540013.0,
+      "reward": 1.6138393878936768,
+      "reward_std": 0.16646739840507507,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6227678656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.4234078824520111,
+      "step": 2629
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1201.0,
+      "completions/max_terminated_length": 1201.0,
+      "completions/mean_length": 457.5982360839844,
+      "completions/mean_terminated_length": 457.5982360839844,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 2.7139540882125353,
+      "grad_norm": 0.6651673316955566,
+      "kl": 0.124755859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0061,
+      "num_tokens": 311651317.0,
+      "reward": 1.5125001668930054,
+      "reward_std": 0.18927127122879028,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.512499988079071,
+      "rewards/curriculum_aware_reward_fn/std": 0.45393672585487366,
+      "step": 2630
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3100.0,
+      "completions/max_terminated_length": 3100.0,
+      "completions/mean_length": 535.6160888671875,
+      "completions/mean_terminated_length": 535.6160888671875,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 2.714985813773536,
+      "grad_norm": 0.7526648640632629,
+      "kl": 0.123291015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0602,
+      "num_tokens": 311778660.0,
+      "reward": 1.4522321224212646,
+      "reward_std": 0.20325230062007904,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45223215222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.381579726934433,
+      "step": 2631
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1229.0,
+      "completions/max_terminated_length": 1229.0,
+      "completions/mean_length": 466.107177734375,
+      "completions/mean_terminated_length": 466.107177734375,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 2.716017539334537,
+      "grad_norm": 0.6304461359977722,
+      "kl": 0.118408203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0294,
+      "num_tokens": 311901872.0,
+      "reward": 1.5357143878936768,
+      "reward_std": 0.2242029458284378,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5357142686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.5000643134117126,
+      "step": 2632
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1372.0,
+      "completions/max_terminated_length": 1372.0,
+      "completions/mean_length": 473.419677734375,
+      "completions/mean_terminated_length": 473.419677734375,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 2.7170492648955378,
+      "grad_norm": 0.6883236169815063,
+      "kl": 0.1221923828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0293,
+      "num_tokens": 312024517.0,
+      "reward": 1.7473214864730835,
+      "reward_std": 0.13166259229183197,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7473214268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.3198986053466797,
+      "step": 2633
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1199.0,
+      "completions/max_terminated_length": 1199.0,
+      "completions/mean_length": 423.71429443359375,
+      "completions/mean_terminated_length": 423.71429443359375,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 2.7180809904565386,
+      "grad_norm": 0.6701761484146118,
+      "kl": 0.108642578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0061,
+      "num_tokens": 312133825.0,
+      "reward": 1.769642949104309,
+      "reward_std": 0.14054904878139496,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7696428298950195,
+      "rewards/curriculum_aware_reward_fn/std": 0.3735848367214203,
+      "step": 2634
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1783.0,
+      "completions/max_terminated_length": 1783.0,
+      "completions/mean_length": 441.4464416503906,
+      "completions/mean_terminated_length": 441.4464416503906,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 2.7191127160175395,
+      "grad_norm": 0.7455883622169495,
+      "kl": 0.1475830078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0452,
+      "num_tokens": 312248696.0,
+      "reward": 1.4531251192092896,
+      "reward_std": 0.16912443935871124,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.453125,
+      "rewards/curriculum_aware_reward_fn/std": 0.42681944370269775,
+      "step": 2635
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1149.0,
+      "completions/mean_length": 571.0357666015625,
+      "completions/mean_terminated_length": 474.0183410644531,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 2.7201444415785403,
+      "grad_norm": 0.6181263327598572,
+      "kl": 0.11181640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0085,
+      "num_tokens": 312384815.0,
+      "reward": 1.5370537042617798,
+      "reward_std": 0.20239168405532837,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5370535850524902,
+      "rewards/curriculum_aware_reward_fn/std": 0.4414220154285431,
+      "step": 2636
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3605.0,
+      "completions/mean_length": 512.0714721679688,
+      "completions/mean_terminated_length": 446.9090881347656,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 2.721176167139541,
+      "grad_norm": 0.6179798245429993,
+      "kl": 0.1160888671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0371,
+      "num_tokens": 312509374.0,
+      "reward": 1.5450894832611084,
+      "reward_std": 0.1635485589504242,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5450892448425293,
+      "rewards/curriculum_aware_reward_fn/std": 0.43178656697273254,
+      "step": 2637
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1103.0,
+      "completions/mean_length": 531.9375,
+      "completions/mean_terminated_length": 499.8288269042969,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 2.7222078927005415,
+      "grad_norm": 0.6158668994903564,
+      "kl": 0.1302490234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0375,
+      "num_tokens": 312638909.0,
+      "reward": 1.4093750715255737,
+      "reward_std": 0.16171173751354218,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41830357909202576,
+      "rewards/curriculum_aware_reward_fn/std": 0.43262219429016113,
+      "step": 2638
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1644.0,
+      "completions/max_terminated_length": 1644.0,
+      "completions/mean_length": 457.15179443359375,
+      "completions/mean_terminated_length": 457.15179443359375,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 2.7232396182615424,
+      "grad_norm": 0.7206525206565857,
+      "kl": 0.1240234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0109,
+      "num_tokens": 312757548.0,
+      "reward": 1.4991071224212646,
+      "reward_std": 0.16917501389980316,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49910715222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.4314746558666229,
+      "step": 2639
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2808.0,
+      "completions/max_terminated_length": 2808.0,
+      "completions/mean_length": 410.27679443359375,
+      "completions/mean_terminated_length": 410.27679443359375,
+      "completions/min_length": 114.0,
+      "completions/min_terminated_length": 114.0,
+      "epoch": 2.724271343822543,
+      "grad_norm": 0.7098089456558228,
+      "kl": 0.1295166015625,
+      "learning_rate": 1e-06,
+      "loss": 0.053,
+      "num_tokens": 312866520.0,
+      "reward": 1.6053574085235596,
+      "reward_std": 0.20159611105918884,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6053571701049805,
+      "rewards/curriculum_aware_reward_fn/std": 0.4335770606994629,
+      "step": 2640
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1616.0,
+      "completions/max_terminated_length": 1616.0,
+      "completions/mean_length": 533.107177734375,
+      "completions/mean_terminated_length": 533.107177734375,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 2.725303069383544,
+      "grad_norm": 0.6682390570640564,
+      "kl": 0.1396484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0051,
+      "num_tokens": 313002325.0,
+      "reward": 1.345089316368103,
+      "reward_std": 0.18961378931999207,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.34508928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.36415764689445496,
+      "step": 2641
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2577.0,
+      "completions/max_terminated_length": 2577.0,
+      "completions/mean_length": 557.3035888671875,
+      "completions/mean_terminated_length": 557.3035888671875,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "epoch": 2.726334794944545,
+      "grad_norm": 0.5817402601242065,
+      "kl": 0.1025390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0154,
+      "num_tokens": 313133861.0,
+      "reward": 1.4174107313156128,
+      "reward_std": 0.18837326765060425,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4174107611179352,
+      "rewards/curriculum_aware_reward_fn/std": 0.4443189799785614,
+      "step": 2642
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1767.0,
+      "completions/max_terminated_length": 1767.0,
+      "completions/mean_length": 505.8482360839844,
+      "completions/mean_terminated_length": 505.8482360839844,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "epoch": 2.7273665205055453,
+      "grad_norm": 0.7893069982528687,
+      "kl": 0.126953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0055,
+      "num_tokens": 313251360.0,
+      "reward": 1.5683037042617798,
+      "reward_std": 0.22728142142295837,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5683035254478455,
+      "rewards/curriculum_aware_reward_fn/std": 0.3967205286026001,
+      "step": 2643
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1756.0,
+      "completions/max_terminated_length": 1756.0,
+      "completions/mean_length": 526.2053833007812,
+      "completions/mean_terminated_length": 526.2053833007812,
+      "completions/min_length": 258.0,
+      "completions/min_terminated_length": 258.0,
+      "epoch": 2.728398246066546,
+      "grad_norm": 1.0762910842895508,
+      "kl": 0.125,
+      "learning_rate": 1e-06,
+      "loss": -0.0875,
+      "num_tokens": 313373239.0,
+      "reward": 1.5075894594192505,
+      "reward_std": 0.2532075047492981,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5165178179740906,
+      "rewards/curriculum_aware_reward_fn/std": 0.5285279154777527,
+      "step": 2644
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1864.0,
+      "completions/max_terminated_length": 1864.0,
+      "completions/mean_length": 533.5803833007812,
+      "completions/mean_terminated_length": 533.5803833007812,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 2.729429971627547,
+      "grad_norm": 0.5750779509544373,
+      "kl": 0.1351318359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0021,
+      "num_tokens": 313503714.0,
+      "reward": 1.6111608743667603,
+      "reward_std": 0.21896274387836456,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6111606955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.4266309142112732,
+      "step": 2645
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3007.0,
+      "completions/max_terminated_length": 3007.0,
+      "completions/mean_length": 578.0803833007812,
+      "completions/mean_terminated_length": 578.0803833007812,
+      "completions/min_length": 117.0,
+      "completions/min_terminated_length": 117.0,
+      "epoch": 2.730461697188548,
+      "grad_norm": 0.5010013580322266,
+      "kl": 0.13671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0382,
+      "num_tokens": 313643527.0,
+      "reward": 1.4477678537368774,
+      "reward_std": 0.1527036726474762,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47455358505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.43397367000579834,
+      "step": 2646
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2310.0,
+      "completions/mean_length": 656.5714721679688,
+      "completions/mean_terminated_length": 594.0363159179688,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 2.7314934227495486,
+      "grad_norm": 0.5431753396987915,
+      "kl": 0.1512451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0567,
+      "num_tokens": 313783927.0,
+      "reward": 1.419196605682373,
+      "reward_std": 0.18558138608932495,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4459821581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.41313666105270386,
+      "step": 2647
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2747.0,
+      "completions/mean_length": 712.2500610351562,
+      "completions/mean_terminated_length": 681.7658081054688,
+      "completions/min_length": 231.0,
+      "completions/min_terminated_length": 231.0,
+      "epoch": 2.7325251483105495,
+      "grad_norm": 0.5787481069564819,
+      "kl": 0.123046875,
+      "learning_rate": 1e-06,
+      "loss": 0.039,
+      "num_tokens": 313927588.0,
+      "reward": 1.3513394594192505,
+      "reward_std": 0.17803417146205902,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36026784777641296,
+      "rewards/curriculum_aware_reward_fn/std": 0.37860211730003357,
+      "step": 2648
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2207.0,
+      "completions/max_terminated_length": 2207.0,
+      "completions/mean_length": 581.9910888671875,
+      "completions/mean_terminated_length": 581.9910888671875,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 2.7335568738715503,
+      "grad_norm": 0.6699755787849426,
+      "kl": 0.156494140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0011,
+      "num_tokens": 314061700.0,
+      "reward": 1.5098215341567993,
+      "reward_std": 0.30378657579421997,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5098214149475098,
+      "rewards/curriculum_aware_reward_fn/std": 0.4118645489215851,
+      "step": 2649
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3618.0,
+      "completions/mean_length": 695.2678833007812,
+      "completions/mean_terminated_length": 633.4363403320312,
+      "completions/min_length": 161.0,
+      "completions/min_terminated_length": 161.0,
+      "epoch": 2.734588599432551,
+      "grad_norm": 0.4774550497531891,
+      "kl": 0.133056640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0942,
+      "num_tokens": 314211089.0,
+      "reward": 1.4571430683135986,
+      "reward_std": 0.13818678259849548,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47499996423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.4217616617679596,
+      "step": 2650
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 4059.0,
+      "completions/max_terminated_length": 4059.0,
+      "completions/mean_length": 576.4642944335938,
+      "completions/mean_terminated_length": 576.4642944335938,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "epoch": 2.7356203249935516,
+      "grad_norm": 0.4988778531551361,
+      "kl": 0.145263671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0311,
+      "num_tokens": 314337967.0,
+      "reward": 1.5928571224212646,
+      "reward_std": 0.25025662779808044,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.610714316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.524036169052124,
+      "step": 2651
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2997.0,
+      "completions/mean_length": 699.5267944335938,
+      "completions/mean_terminated_length": 637.772705078125,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 2.7366520505545524,
+      "grad_norm": 0.5151804685592651,
+      "kl": 0.124267578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0272,
+      "num_tokens": 314487112.0,
+      "reward": 1.4821430444717407,
+      "reward_std": 0.23278112709522247,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5,
+      "rewards/curriculum_aware_reward_fn/std": 0.43723124265670776,
+      "step": 2652
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1494.0,
+      "completions/max_terminated_length": 1494.0,
+      "completions/mean_length": 571.2678833007812,
+      "completions/mean_terminated_length": 571.2678833007812,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 2.7376837761155532,
+      "grad_norm": 0.5588393211364746,
+      "kl": 0.1396484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0351,
+      "num_tokens": 314613730.0,
+      "reward": 1.6312501430511475,
+      "reward_std": 0.23117785155773163,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1621822714805603,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6580356955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.42598870396614075,
+      "step": 2653
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1377.0,
+      "completions/max_terminated_length": 1377.0,
+      "completions/mean_length": 511.7500305175781,
+      "completions/mean_terminated_length": 511.7500305175781,
+      "completions/min_length": 137.0,
+      "completions/min_terminated_length": 137.0,
+      "epoch": 2.738715501676554,
+      "grad_norm": 0.6131327748298645,
+      "kl": 0.15673828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0306,
+      "num_tokens": 314738428.0,
+      "reward": 1.487946629524231,
+      "reward_std": 0.188005730509758,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5058035850524902,
+      "rewards/curriculum_aware_reward_fn/std": 0.4395301043987274,
+      "step": 2654
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1420.0,
+      "completions/max_terminated_length": 1420.0,
+      "completions/mean_length": 545.9017944335938,
+      "completions/mean_terminated_length": 545.9017944335938,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "epoch": 2.739747227237555,
+      "grad_norm": 0.6534278392791748,
+      "kl": 0.14794921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0196,
+      "num_tokens": 314865145.0,
+      "reward": 1.4830358028411865,
+      "reward_std": 0.29470551013946533,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48303571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.41190358996391296,
+      "step": 2655
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2448.0,
+      "completions/max_terminated_length": 2448.0,
+      "completions/mean_length": 597.357177734375,
+      "completions/mean_terminated_length": 597.357177734375,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 2.7407789527985553,
+      "grad_norm": 0.5453312993049622,
+      "kl": 0.144775390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0692,
+      "num_tokens": 314995868.0,
+      "reward": 1.5629465579986572,
+      "reward_std": 0.17557427287101746,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5629464387893677,
+      "rewards/curriculum_aware_reward_fn/std": 0.40037962794303894,
+      "step": 2656
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2168.0,
+      "completions/max_terminated_length": 2168.0,
+      "completions/mean_length": 551.7232666015625,
+      "completions/mean_terminated_length": 551.7232666015625,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 2.741810678359556,
+      "grad_norm": 0.5800078511238098,
+      "kl": 0.156005859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0146,
+      "num_tokens": 315122768.0,
+      "reward": 1.407142996788025,
+      "reward_std": 0.23318617045879364,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40714284777641296,
+      "rewards/curriculum_aware_reward_fn/std": 0.44198858737945557,
+      "step": 2657
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1561.0,
+      "completions/max_terminated_length": 1561.0,
+      "completions/mean_length": 497.919677734375,
+      "completions/mean_terminated_length": 497.919677734375,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 2.742842403920557,
+      "grad_norm": 0.5884579420089722,
+      "kl": 0.156982421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0113,
+      "num_tokens": 315244251.0,
+      "reward": 1.593750238418579,
+      "reward_std": 0.144059419631958,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.59375,
+      "rewards/curriculum_aware_reward_fn/std": 0.43599969148635864,
+      "step": 2658
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2169.0,
+      "completions/max_terminated_length": 2169.0,
+      "completions/mean_length": 517.5535888671875,
+      "completions/mean_terminated_length": 517.5535888671875,
+      "completions/min_length": 143.0,
+      "completions/min_terminated_length": 143.0,
+      "epoch": 2.743874129481558,
+      "grad_norm": 0.5885396003723145,
+      "kl": 0.150146484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0635,
+      "num_tokens": 315364020.0,
+      "reward": 1.4656251668930054,
+      "reward_std": 0.10819364339113235,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46562501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.43058890104293823,
+      "step": 2659
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1594.0,
+      "completions/mean_length": 617.232177734375,
+      "completions/mean_terminated_length": 553.9818115234375,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 2.7449058550425587,
+      "grad_norm": 0.6162183284759521,
+      "kl": 0.15185546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0335,
+      "num_tokens": 315500742.0,
+      "reward": 1.3535715341567993,
+      "reward_std": 0.23010680079460144,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.4042288064956665,
+      "step": 2660
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1577.0,
+      "completions/mean_length": 586.7678833007812,
+      "completions/mean_terminated_length": 555.1531372070312,
+      "completions/min_length": 150.0,
+      "completions/min_terminated_length": 150.0,
+      "epoch": 2.7459375806035595,
+      "grad_norm": 0.5109012126922607,
+      "kl": 0.15234375,
+      "learning_rate": 1e-06,
+      "loss": 0.019,
+      "num_tokens": 315637447.0,
+      "reward": 1.4986608028411865,
+      "reward_std": 0.12022580206394196,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49866071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.4184624254703522,
+      "step": 2661
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1468.0,
+      "completions/mean_length": 581.5089721679688,
+      "completions/mean_terminated_length": 549.8468627929688,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "epoch": 2.7469693061645604,
+      "grad_norm": 0.6965451240539551,
+      "kl": 0.146240234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0067,
+      "num_tokens": 315777224.0,
+      "reward": 1.5075894594192505,
+      "reward_std": 0.15963546931743622,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5075892806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.40859073400497437,
+      "step": 2662
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2698.0,
+      "completions/max_terminated_length": 2698.0,
+      "completions/mean_length": 538.8660888671875,
+      "completions/mean_terminated_length": 538.8660888671875,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 2.748001031725561,
+      "grad_norm": 0.6566807627677917,
+      "kl": 0.160400390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0306,
+      "num_tokens": 315907759.0,
+      "reward": 1.626339316368103,
+      "reward_std": 0.187081977725029,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6263392567634583,
+      "rewards/curriculum_aware_reward_fn/std": 0.44382449984550476,
+      "step": 2663
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1671.0,
+      "completions/max_terminated_length": 1671.0,
+      "completions/mean_length": 505.3214416503906,
+      "completions/mean_terminated_length": 505.3214416503906,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 2.749032757286562,
+      "grad_norm": 0.6896383762359619,
+      "kl": 0.1728515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0068,
+      "num_tokens": 316033662.0,
+      "reward": 1.60535728931427,
+      "reward_std": 0.19829097390174866,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6053571105003357,
+      "rewards/curriculum_aware_reward_fn/std": 0.5537487268447876,
+      "step": 2664
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1456.0,
+      "completions/max_terminated_length": 1456.0,
+      "completions/mean_length": 576.625,
+      "completions/mean_terminated_length": 576.625,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 2.7500644828475624,
+      "grad_norm": 0.6218982338905334,
+      "kl": 0.161865234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0197,
+      "num_tokens": 316168118.0,
+      "reward": 1.5227679014205933,
+      "reward_std": 0.1973244547843933,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5227678418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.38054537773132324,
+      "step": 2665
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1675.0,
+      "completions/max_terminated_length": 1675.0,
+      "completions/mean_length": 563.9464721679688,
+      "completions/mean_terminated_length": 563.9464721679688,
+      "completions/min_length": 245.0,
+      "completions/min_terminated_length": 245.0,
+      "epoch": 2.7510962084085633,
+      "grad_norm": 0.7110979557037354,
+      "kl": 0.17333984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0263,
+      "num_tokens": 316296198.0,
+      "reward": 1.5325894355773926,
+      "reward_std": 0.2403583526611328,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.532589316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.4220131039619446,
+      "step": 2666
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1559.0,
+      "completions/max_terminated_length": 1559.0,
+      "completions/mean_length": 504.76788330078125,
+      "completions/mean_terminated_length": 504.76788330078125,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 2.752127933969564,
+      "grad_norm": 0.7367867827415466,
+      "kl": 0.220703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0293,
+      "num_tokens": 316425630.0,
+      "reward": 1.5066965818405151,
+      "reward_std": 0.1925990879535675,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5066964030265808,
+      "rewards/curriculum_aware_reward_fn/std": 0.4251388609409332,
+      "step": 2667
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2088.0,
+      "completions/max_terminated_length": 2088.0,
+      "completions/mean_length": 549.3214721679688,
+      "completions/mean_terminated_length": 549.3214721679688,
+      "completions/min_length": 243.0,
+      "completions/min_terminated_length": 243.0,
+      "epoch": 2.753159659530565,
+      "grad_norm": 0.661249577999115,
+      "kl": 0.168701171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0191,
+      "num_tokens": 316558449.0,
+      "reward": 1.5776787996292114,
+      "reward_std": 0.3079984784126282,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5955356955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.40695273876190186,
+      "step": 2668
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3670.0,
+      "completions/max_terminated_length": 3670.0,
+      "completions/mean_length": 578.5535888671875,
+      "completions/mean_terminated_length": 578.5535888671875,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 2.754191385091566,
+      "grad_norm": 0.5973184704780579,
+      "kl": 0.14990234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0081,
+      "num_tokens": 316684805.0,
+      "reward": 1.5625001192092896,
+      "reward_std": 0.22135469317436218,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5714285969734192,
+      "rewards/curriculum_aware_reward_fn/std": 0.42973458766937256,
+      "step": 2669
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3237.0,
+      "completions/max_terminated_length": 3237.0,
+      "completions/mean_length": 671.0178833007812,
+      "completions/mean_terminated_length": 671.0178833007812,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 2.755223110652566,
+      "grad_norm": 0.6458407640457153,
+      "kl": 0.146484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0057,
+      "num_tokens": 316831023.0,
+      "reward": 1.447767972946167,
+      "reward_std": 0.3050350546836853,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44776788353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.41173049807548523,
+      "step": 2670
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2096.0,
+      "completions/mean_length": 698.4285888671875,
+      "completions/mean_terminated_length": 636.654541015625,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 2.756254836213567,
+      "grad_norm": 0.6006747484207153,
+      "kl": 0.139892578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0067,
+      "num_tokens": 316986233.0,
+      "reward": 1.3883929252624512,
+      "reward_std": 0.22081252932548523,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3883928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.4119270443916321,
+      "step": 2671
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1343.0,
+      "completions/max_terminated_length": 1343.0,
+      "completions/mean_length": 509.9285888671875,
+      "completions/mean_terminated_length": 509.9285888671875,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 2.757286561774568,
+      "grad_norm": 0.600599467754364,
+      "kl": 0.156982421875,
+      "learning_rate": 1e-06,
+      "loss": -0.006,
+      "num_tokens": 317104266.0,
+      "reward": 1.6607143878936768,
+      "reward_std": 0.16374391317367554,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6607142686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.3964611887931824,
+      "step": 2672
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3150.0,
+      "completions/max_terminated_length": 3150.0,
+      "completions/mean_length": 566.7232666015625,
+      "completions/mean_terminated_length": 566.7232666015625,
+      "completions/min_length": 254.0,
+      "completions/min_terminated_length": 254.0,
+      "epoch": 2.7583182873355687,
+      "grad_norm": 0.5870814919471741,
+      "kl": 0.158203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0151,
+      "num_tokens": 317239895.0,
+      "reward": 1.4950894117355347,
+      "reward_std": 0.10298654437065125,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4950892925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.43774399161338806,
+      "step": 2673
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3536.0,
+      "completions/max_terminated_length": 3536.0,
+      "completions/mean_length": 510.4464416503906,
+      "completions/mean_terminated_length": 510.4464416503906,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 2.7593500128965696,
+      "grad_norm": 0.5936353802680969,
+      "kl": 0.168701171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0325,
+      "num_tokens": 317363628.0,
+      "reward": 1.5486608743667603,
+      "reward_std": 0.1814994066953659,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5486606955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.4584766924381256,
+      "step": 2674
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1354.0,
+      "completions/max_terminated_length": 1354.0,
+      "completions/mean_length": 546.7410888671875,
+      "completions/mean_terminated_length": 546.7410888671875,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 2.7603817384575704,
+      "grad_norm": 0.641977071762085,
+      "kl": 0.1669921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0081,
+      "num_tokens": 317487359.0,
+      "reward": 1.5107144117355347,
+      "reward_std": 0.22570699453353882,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5107142329216003,
+      "rewards/curriculum_aware_reward_fn/std": 0.4437974989414215,
+      "step": 2675
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3354.0,
+      "completions/max_terminated_length": 3354.0,
+      "completions/mean_length": 653.9464721679688,
+      "completions/mean_terminated_length": 653.9464721679688,
+      "completions/min_length": 274.0,
+      "completions/min_terminated_length": 274.0,
+      "epoch": 2.7614134640185712,
+      "grad_norm": 0.5655720829963684,
+      "kl": 0.145263671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0062,
+      "num_tokens": 317625925.0,
+      "reward": 1.5308035612106323,
+      "reward_std": 0.19269868731498718,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5308035612106323,
+      "rewards/curriculum_aware_reward_fn/std": 0.39990121126174927,
+      "step": 2676
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2861.0,
+      "completions/mean_length": 652.357177734375,
+      "completions/mean_terminated_length": 621.3333740234375,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 2.762445189579572,
+      "grad_norm": 0.5561245679855347,
+      "kl": 0.1455078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0495,
+      "num_tokens": 317764264.0,
+      "reward": 1.4468750953674316,
+      "reward_std": 0.1468437761068344,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45580360293388367,
+      "rewards/curriculum_aware_reward_fn/std": 0.426368772983551,
+      "step": 2677
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3223.0,
+      "completions/max_terminated_length": 3223.0,
+      "completions/mean_length": 674.7142944335938,
+      "completions/mean_terminated_length": 674.7142944335938,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 2.7634769151405725,
+      "grad_norm": 0.5874546766281128,
+      "kl": 0.14306640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0474,
+      "num_tokens": 317916468.0,
+      "reward": 1.4196429252624512,
+      "reward_std": 0.2191556692123413,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4196428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.40953484177589417,
+      "step": 2678
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2168.0,
+      "completions/max_terminated_length": 2168.0,
+      "completions/mean_length": 636.607177734375,
+      "completions/mean_terminated_length": 636.607177734375,
+      "completions/min_length": 245.0,
+      "completions/min_terminated_length": 245.0,
+      "epoch": 2.7645086407015733,
+      "grad_norm": 0.5092126727104187,
+      "kl": 0.14453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0089,
+      "num_tokens": 318062864.0,
+      "reward": 1.472321629524231,
+      "reward_std": 0.17760160565376282,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47232145071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.42292627692222595,
+      "step": 2679
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1744.0,
+      "completions/max_terminated_length": 1744.0,
+      "completions/mean_length": 626.794677734375,
+      "completions/mean_terminated_length": 626.794677734375,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "epoch": 2.765540366262574,
+      "grad_norm": 0.5636436939239502,
+      "kl": 0.13720703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0336,
+      "num_tokens": 318204094.0,
+      "reward": 1.411607265472412,
+      "reward_std": 0.20242133736610413,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41160711646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.4180058538913727,
+      "step": 2680
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2572.0,
+      "completions/max_terminated_length": 2572.0,
+      "completions/mean_length": 590.5,
+      "completions/mean_terminated_length": 590.5,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 2.766572091823575,
+      "grad_norm": 0.6176509857177734,
+      "kl": 0.159912109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0179,
+      "num_tokens": 318331025.0,
+      "reward": 1.4200893640518188,
+      "reward_std": 0.17646051943302155,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4200892746448517,
+      "rewards/curriculum_aware_reward_fn/std": 0.4145691394805908,
+      "step": 2681
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2171.0,
+      "completions/mean_length": 650.919677734375,
+      "completions/mean_terminated_length": 619.8828735351562,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 2.767603817384576,
+      "grad_norm": 0.5832086205482483,
+      "kl": 0.15283203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0233,
+      "num_tokens": 318476595.0,
+      "reward": 1.602678656578064,
+      "reward_std": 0.18887139856815338,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6026785969734192,
+      "rewards/curriculum_aware_reward_fn/std": 0.4144267439842224,
+      "step": 2682
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1726.0,
+      "completions/max_terminated_length": 1726.0,
+      "completions/mean_length": 543.0267944335938,
+      "completions/mean_terminated_length": 543.0267944335938,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 2.7686355429455762,
+      "grad_norm": 0.6461904644966125,
+      "kl": 0.159423828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0094,
+      "num_tokens": 318607561.0,
+      "reward": 1.472767949104309,
+      "reward_std": 0.24298816919326782,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48169639706611633,
+      "rewards/curriculum_aware_reward_fn/std": 0.3800187110900879,
+      "step": 2683
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2606.0,
+      "completions/mean_length": 685.482177734375,
+      "completions/mean_terminated_length": 654.7567749023438,
+      "completions/min_length": 257.0,
+      "completions/min_terminated_length": 257.0,
+      "epoch": 2.769667268506577,
+      "grad_norm": 0.5030291080474854,
+      "kl": 0.134765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0177,
+      "num_tokens": 318751794.0,
+      "reward": 1.432142972946167,
+      "reward_std": 0.18000416457653046,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44107145071029663,
+      "rewards/curriculum_aware_reward_fn/std": 0.419148325920105,
+      "step": 2684
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3369.0,
+      "completions/max_terminated_length": 3369.0,
+      "completions/mean_length": 612.4375,
+      "completions/mean_terminated_length": 612.4375,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 2.770698994067578,
+      "grad_norm": 0.5694253444671631,
+      "kl": 0.154541015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0061,
+      "num_tokens": 318899718.0,
+      "reward": 1.4370537996292114,
+      "reward_std": 0.15659162402153015,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4370535910129547,
+      "rewards/curriculum_aware_reward_fn/std": 0.429893434047699,
+      "step": 2685
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2680.0,
+      "completions/max_terminated_length": 2680.0,
+      "completions/mean_length": 691.3214721679688,
+      "completions/mean_terminated_length": 691.3214721679688,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 2.7717307196285788,
+      "grad_norm": 0.5326395034790039,
+      "kl": 0.139892578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0154,
+      "num_tokens": 319038200.0,
+      "reward": 1.462053656578064,
+      "reward_std": 0.19021055102348328,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4709821343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.4151759743690491,
+      "step": 2686
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2392.0,
+      "completions/max_terminated_length": 2392.0,
+      "completions/mean_length": 689.4642944335938,
+      "completions/mean_terminated_length": 689.4642944335938,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 2.7727624451895796,
+      "grad_norm": 0.5327898859977722,
+      "kl": 0.1422119140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0102,
+      "num_tokens": 319175487.0,
+      "reward": 1.5566965341567993,
+      "reward_std": 0.17435990273952484,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5566964149475098,
+      "rewards/curriculum_aware_reward_fn/std": 0.4176023006439209,
+      "step": 2687
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1527.0,
+      "completions/max_terminated_length": 1527.0,
+      "completions/mean_length": 625.7767944335938,
+      "completions/mean_terminated_length": 625.7767944335938,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 2.7737941707505804,
+      "grad_norm": 0.5624753832817078,
+      "kl": 0.151611328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0162,
+      "num_tokens": 319319620.0,
+      "reward": 1.5593750476837158,
+      "reward_std": 0.2108374536037445,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5683035254478455,
+      "rewards/curriculum_aware_reward_fn/std": 0.4098997414112091,
+      "step": 2688
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2804.0,
+      "completions/mean_length": 709.7767944335938,
+      "completions/mean_terminated_length": 648.2090454101562,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 2.7748258963115813,
+      "grad_norm": 0.4340974986553192,
+      "kl": 0.142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0671,
+      "num_tokens": 319458118.0,
+      "reward": 1.4397321939468384,
+      "reward_std": 0.15324297547340393,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4575892984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.4162904620170593,
+      "step": 2689
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2372.0,
+      "completions/max_terminated_length": 2372.0,
+      "completions/mean_length": 643.1339721679688,
+      "completions/mean_terminated_length": 643.1339721679688,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 2.775857621872582,
+      "grad_norm": 0.49220940470695496,
+      "kl": 0.14306640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0193,
+      "num_tokens": 319593765.0,
+      "reward": 1.5763393640518188,
+      "reward_std": 0.14425250887870789,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5763392448425293,
+      "rewards/curriculum_aware_reward_fn/std": 0.40196365118026733,
+      "step": 2690
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2623.0,
+      "completions/max_terminated_length": 2623.0,
+      "completions/mean_length": 591.482177734375,
+      "completions/mean_terminated_length": 591.482177734375,
+      "completions/min_length": 131.0,
+      "completions/min_terminated_length": 131.0,
+      "epoch": 2.7768893474335825,
+      "grad_norm": 0.6150352954864502,
+      "kl": 0.176513671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0309,
+      "num_tokens": 319723004.0,
+      "reward": 1.5531251430511475,
+      "reward_std": 0.21724960207939148,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5620535612106323,
+      "rewards/curriculum_aware_reward_fn/std": 0.42214837670326233,
+      "step": 2691
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1337.0,
+      "completions/max_terminated_length": 1337.0,
+      "completions/mean_length": 640.1160888671875,
+      "completions/mean_terminated_length": 640.1160888671875,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "epoch": 2.7779210729945834,
+      "grad_norm": 0.47986412048339844,
+      "kl": 0.14599609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0389,
+      "num_tokens": 319856458.0,
+      "reward": 1.4339287281036377,
+      "reward_std": 0.2294609099626541,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45178571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.46975788474082947,
+      "step": 2692
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1712.0,
+      "completions/mean_length": 592.794677734375,
+      "completions/mean_terminated_length": 561.2342529296875,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 2.778952798555584,
+      "grad_norm": 0.48358824849128723,
+      "kl": 0.15869140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0188,
+      "num_tokens": 319982295.0,
+      "reward": 1.571428656578064,
+      "reward_std": 0.12172428518533707,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5714285969734192,
+      "rewards/curriculum_aware_reward_fn/std": 0.4192289412021637,
+      "step": 2693
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3854.0,
+      "completions/max_terminated_length": 3854.0,
+      "completions/mean_length": 612.732177734375,
+      "completions/mean_terminated_length": 612.732177734375,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 2.779984524116585,
+      "grad_norm": 0.5338239669799805,
+      "kl": 0.14404296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0022,
+      "num_tokens": 320106718.0,
+      "reward": 1.6000001430511475,
+      "reward_std": 0.1714448630809784,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5999999642372131,
+      "rewards/curriculum_aware_reward_fn/std": 0.4356314539909363,
+      "step": 2694
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1616.0,
+      "completions/max_terminated_length": 1616.0,
+      "completions/mean_length": 661.9285888671875,
+      "completions/mean_terminated_length": 661.9285888671875,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 2.781016249677586,
+      "grad_norm": 0.5156787037849426,
+      "kl": 0.152099609375,
+      "learning_rate": 1e-06,
+      "loss": -0.02,
+      "num_tokens": 320257147.0,
+      "reward": 1.6071429252624512,
+      "reward_std": 0.18883086740970612,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6071428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.4365020990371704,
+      "step": 2695
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3227.0,
+      "completions/mean_length": 698.857177734375,
+      "completions/mean_terminated_length": 668.2522583007812,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 2.7820479752385863,
+      "grad_norm": 0.3876507878303528,
+      "kl": 0.1279296875,
+      "learning_rate": 1e-06,
+      "loss": 0.017,
+      "num_tokens": 320398419.0,
+      "reward": 1.4830358028411865,
+      "reward_std": 0.167936772108078,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5008928179740906,
+      "rewards/curriculum_aware_reward_fn/std": 0.4581092298030853,
+      "step": 2696
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2221.0,
+      "completions/max_terminated_length": 2221.0,
+      "completions/mean_length": 693.8928833007812,
+      "completions/mean_terminated_length": 693.8928833007812,
+      "completions/min_length": 258.0,
+      "completions/min_terminated_length": 258.0,
+      "epoch": 2.783079700799587,
+      "grad_norm": 0.5896779298782349,
+      "kl": 0.137939453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0198,
+      "num_tokens": 320542614.0,
+      "reward": 1.4705358743667603,
+      "reward_std": 0.2531685531139374,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4794643223285675,
+      "rewards/curriculum_aware_reward_fn/std": 0.4202934205532074,
+      "step": 2697
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2356.0,
+      "completions/mean_length": 774.2142944335938,
+      "completions/mean_terminated_length": 713.8181762695312,
+      "completions/min_length": 269.0,
+      "completions/min_terminated_length": 269.0,
+      "epoch": 2.784111426360588,
+      "grad_norm": 0.4785737693309784,
+      "kl": 0.122314453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0034,
+      "num_tokens": 320696736.0,
+      "reward": 1.4147323369979858,
+      "reward_std": 0.21173430979251862,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4236606955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.43407562375068665,
+      "step": 2698
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2713.0,
+      "completions/mean_length": 766.0625610351562,
+      "completions/mean_terminated_length": 736.0631103515625,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "epoch": 2.785143151921589,
+      "grad_norm": 0.5446650385856628,
+      "kl": 0.1336669921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0164,
+      "num_tokens": 320853571.0,
+      "reward": 1.4437501430511475,
+      "reward_std": 0.16912689805030823,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4437499940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.4204847514629364,
+      "step": 2699
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1931.0,
+      "completions/max_terminated_length": 1931.0,
+      "completions/mean_length": 634.2053833007812,
+      "completions/mean_terminated_length": 634.2053833007812,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 2.7861748774825896,
+      "grad_norm": 0.5768882632255554,
+      "kl": 0.140380859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0006,
+      "num_tokens": 320986972.0,
+      "reward": 1.5616072416305542,
+      "reward_std": 0.2245289385318756,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5616071820259094,
+      "rewards/curriculum_aware_reward_fn/std": 0.44311460852622986,
+      "step": 2700
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1731.0,
+      "completions/mean_length": 669.357177734375,
+      "completions/mean_terminated_length": 638.4865112304688,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 2.7872066030435905,
+      "grad_norm": 0.5173462629318237,
+      "kl": 0.1390380859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0049,
+      "num_tokens": 321138751.0,
+      "reward": 1.602678656578064,
+      "reward_std": 0.1530117690563202,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6116071343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.4148690402507782,
+      "step": 2701
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2842.0,
+      "completions/max_terminated_length": 2842.0,
+      "completions/mean_length": 564.1964721679688,
+      "completions/mean_terminated_length": 564.1964721679688,
+      "completions/min_length": 161.0,
+      "completions/min_terminated_length": 161.0,
+      "epoch": 2.7882383286045913,
+      "grad_norm": 0.5467989444732666,
+      "kl": 0.17529296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0098,
+      "num_tokens": 321256849.0,
+      "reward": 1.5928571224212646,
+      "reward_std": 0.1943163275718689,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5928570628166199,
+      "rewards/curriculum_aware_reward_fn/std": 0.41578415036201477,
+      "step": 2702
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2809.0,
+      "completions/mean_length": 691.607177734375,
+      "completions/mean_terminated_length": 629.7090454101562,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 2.789270054165592,
+      "grad_norm": 0.5662619471549988,
+      "kl": 0.1351318359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0591,
+      "num_tokens": 321401355.0,
+      "reward": 1.3928571939468384,
+      "reward_std": 0.18446427583694458,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3928571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.4366568922996521,
+      "step": 2703
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2381.0,
+      "completions/mean_length": 690.7857666015625,
+      "completions/mean_terminated_length": 660.108154296875,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 2.790301779726593,
+      "grad_norm": 0.5307469964027405,
+      "kl": 0.128662109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0268,
+      "num_tokens": 321552916.0,
+      "reward": 1.419196605682373,
+      "reward_std": 0.1603444665670395,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41919639706611633,
+      "rewards/curriculum_aware_reward_fn/std": 0.43605756759643555,
+      "step": 2704
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3711.0,
+      "completions/mean_length": 635.1875,
+      "completions/mean_terminated_length": 604.009033203125,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 2.7913335052875934,
+      "grad_norm": 0.48673635721206665,
+      "kl": 0.153564453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0467,
+      "num_tokens": 321689850.0,
+      "reward": 1.6089287996292114,
+      "reward_std": 0.11687122285366058,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6178570985794067,
+      "rewards/curriculum_aware_reward_fn/std": 0.41551318764686584,
+      "step": 2705
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2425.0,
+      "completions/mean_length": 666.9375,
+      "completions/mean_terminated_length": 636.0450439453125,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 2.7923652308485942,
+      "grad_norm": 0.5421416163444519,
+      "kl": 0.1376953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0298,
+      "num_tokens": 321830536.0,
+      "reward": 1.5468751192092896,
+      "reward_std": 0.17485347390174866,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.546875,
+      "rewards/curriculum_aware_reward_fn/std": 0.4419082999229431,
+      "step": 2706
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2176.0,
+      "completions/mean_length": 717.7589721679688,
+      "completions/mean_terminated_length": 687.3243408203125,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 2.793396956409595,
+      "grad_norm": 0.5420852303504944,
+      "kl": 0.138671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0489,
+      "num_tokens": 321981282.0,
+      "reward": 1.5647321939468384,
+      "reward_std": 0.24127903580665588,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5647321939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3975509703159332,
+      "step": 2707
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1874.0,
+      "completions/max_terminated_length": 1874.0,
+      "completions/mean_length": 603.0,
+      "completions/mean_terminated_length": 603.0,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 2.794428681970596,
+      "grad_norm": 0.47553688287734985,
+      "kl": 0.143310546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0418,
+      "num_tokens": 322115976.0,
+      "reward": 1.4941965341567993,
+      "reward_std": 0.1817176789045334,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49419641494750977,
+      "rewards/curriculum_aware_reward_fn/std": 0.46074485778808594,
+      "step": 2708
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2807.0,
+      "completions/max_terminated_length": 2807.0,
+      "completions/mean_length": 632.1339721679688,
+      "completions/mean_terminated_length": 632.1339721679688,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 2.7954604075315967,
+      "grad_norm": 0.6050577163696289,
+      "kl": 0.144775390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0226,
+      "num_tokens": 322257377.0,
+      "reward": 1.5830357074737549,
+      "reward_std": 0.21893826127052307,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5830357670783997,
+      "rewards/curriculum_aware_reward_fn/std": 0.41765937209129333,
+      "step": 2709
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2743.0,
+      "completions/max_terminated_length": 2743.0,
+      "completions/mean_length": 645.8928833007812,
+      "completions/mean_terminated_length": 645.8928833007812,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 2.796492133092597,
+      "grad_norm": 0.6692031025886536,
+      "kl": 0.175537109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0363,
+      "num_tokens": 322396546.0,
+      "reward": 1.5839285850524902,
+      "reward_std": 0.2636502683162689,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5839285850524902,
+      "rewards/curriculum_aware_reward_fn/std": 0.4007433354854584,
+      "step": 2710
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2234.0,
+      "completions/mean_length": 783.8928833007812,
+      "completions/mean_terminated_length": 754.0540771484375,
+      "completions/min_length": 287.0,
+      "completions/min_terminated_length": 287.0,
+      "epoch": 2.797523858653598,
+      "grad_norm": 0.5401923656463623,
+      "kl": 0.144287109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0312,
+      "num_tokens": 322557464.0,
+      "reward": 1.485267996788025,
+      "reward_std": 0.230984166264534,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48526784777641296,
+      "rewards/curriculum_aware_reward_fn/std": 0.40673598647117615,
+      "step": 2711
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2548.0,
+      "completions/mean_length": 667.419677734375,
+      "completions/mean_terminated_length": 605.081787109375,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 2.798555584214599,
+      "grad_norm": 0.46403664350509644,
+      "kl": 0.1494140625,
+      "learning_rate": 1e-06,
+      "loss": 0.067,
+      "num_tokens": 322699170.0,
+      "reward": 1.650892972946167,
+      "reward_std": 0.172529399394989,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6598214507102966,
+      "rewards/curriculum_aware_reward_fn/std": 0.4270598590373993,
+      "step": 2712
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3091.0,
+      "completions/mean_length": 759.669677734375,
+      "completions/mean_terminated_length": 729.6126098632812,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "epoch": 2.7995873097755997,
+      "grad_norm": 0.5070902109146118,
+      "kl": 0.126708984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0309,
+      "num_tokens": 322850298.0,
+      "reward": 1.4946428537368774,
+      "reward_std": 0.20876778662204742,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49464282393455505,
+      "rewards/curriculum_aware_reward_fn/std": 0.4294012784957886,
+      "step": 2713
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2500.0,
+      "completions/max_terminated_length": 2500.0,
+      "completions/mean_length": 625.5267944335938,
+      "completions/mean_terminated_length": 625.5267944335938,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 2.8006190353366005,
+      "grad_norm": 0.4674634337425232,
+      "kl": 0.15185546875,
+      "learning_rate": 1e-06,
+      "loss": 0.01,
+      "num_tokens": 322989685.0,
+      "reward": 1.5687501430511475,
+      "reward_std": 0.2000505030155182,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5687500238418579,
+      "rewards/curriculum_aware_reward_fn/std": 0.40576502680778503,
+      "step": 2714
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2498.0,
+      "completions/max_terminated_length": 2498.0,
+      "completions/mean_length": 591.125,
+      "completions/mean_terminated_length": 591.125,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 2.8016507608976013,
+      "grad_norm": 0.509829580783844,
+      "kl": 0.14306640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0437,
+      "num_tokens": 323110288.0,
+      "reward": 1.641964316368103,
+      "reward_std": 0.15762221813201904,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.641964316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.4348321259021759,
+      "step": 2715
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2325.0,
+      "completions/max_terminated_length": 2325.0,
+      "completions/mean_length": 638.8303833007812,
+      "completions/mean_terminated_length": 638.8303833007812,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 2.802682486458602,
+      "grad_norm": 0.4947524964809418,
+      "kl": 0.1376953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0071,
+      "num_tokens": 323245845.0,
+      "reward": 1.6642857789993286,
+      "reward_std": 0.23866406083106995,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6642857193946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.42524126172065735,
+      "step": 2716
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1779.0,
+      "completions/mean_length": 684.0625610351562,
+      "completions/mean_terminated_length": 622.0272827148438,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "epoch": 2.803714212019603,
+      "grad_norm": 0.5975044369697571,
+      "kl": 0.150634765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0169,
+      "num_tokens": 323388323.0,
+      "reward": 1.5200893878936768,
+      "reward_std": 0.1250755488872528,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5200892686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.41948381066322327,
+      "step": 2717
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3763.0,
+      "completions/max_terminated_length": 3763.0,
+      "completions/mean_length": 691.9553833007812,
+      "completions/mean_terminated_length": 691.9553833007812,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 2.8047459375806034,
+      "grad_norm": 0.5362421274185181,
+      "kl": 0.145751953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0154,
+      "num_tokens": 323533204.0,
+      "reward": 1.5040180683135986,
+      "reward_std": 0.16446419060230255,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5040178298950195,
+      "rewards/curriculum_aware_reward_fn/std": 0.5325925946235657,
+      "step": 2718
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2365.0,
+      "completions/mean_length": 581.232177734375,
+      "completions/mean_terminated_length": 549.5675659179688,
+      "completions/min_length": 160.0,
+      "completions/min_terminated_length": 160.0,
+      "epoch": 2.8057776631416043,
+      "grad_norm": 0.6170505881309509,
+      "kl": 0.140869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0415,
+      "num_tokens": 323664172.0,
+      "reward": 1.7236608266830444,
+      "reward_std": 0.10211389511823654,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7236607670783997,
+      "rewards/curriculum_aware_reward_fn/std": 0.39392685890197754,
+      "step": 2719
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1792.0,
+      "completions/max_terminated_length": 1792.0,
+      "completions/mean_length": 621.0625,
+      "completions/mean_terminated_length": 621.0625,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 2.806809388702605,
+      "grad_norm": 0.5479264259338379,
+      "kl": 0.14306640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0126,
+      "num_tokens": 323794162.0,
+      "reward": 1.52723228931427,
+      "reward_std": 0.2021634876728058,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5272321105003357,
+      "rewards/curriculum_aware_reward_fn/std": 0.4500194489955902,
+      "step": 2720
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1790.0,
+      "completions/mean_length": 690.6160888671875,
+      "completions/mean_terminated_length": 659.9369506835938,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 2.807841114263606,
+      "grad_norm": 0.6118156313896179,
+      "kl": 0.138916015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0021,
+      "num_tokens": 323944052.0,
+      "reward": 1.6593750715255737,
+      "reward_std": 0.1973244845867157,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.659375011920929,
+      "rewards/curriculum_aware_reward_fn/std": 0.3662281334400177,
+      "step": 2721
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2712.0,
+      "completions/max_terminated_length": 2712.0,
+      "completions/mean_length": 690.0000610351562,
+      "completions/mean_terminated_length": 690.0000610351562,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 2.808872839824607,
+      "grad_norm": 0.6139779090881348,
+      "kl": 0.12939453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0052,
+      "num_tokens": 324089911.0,
+      "reward": 1.5500000715255737,
+      "reward_std": 0.2188665270805359,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.550000011920929,
+      "rewards/curriculum_aware_reward_fn/std": 0.40509146451950073,
+      "step": 2722
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1743.0,
+      "completions/max_terminated_length": 1743.0,
+      "completions/mean_length": 628.9107666015625,
+      "completions/mean_terminated_length": 628.9107666015625,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 2.809904565385607,
+      "grad_norm": 0.5854537487030029,
+      "kl": 0.143798828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0433,
+      "num_tokens": 324235592.0,
+      "reward": 1.5468751192092896,
+      "reward_std": 0.190732941031456,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.546875,
+      "rewards/curriculum_aware_reward_fn/std": 0.41352590918540955,
+      "step": 2723
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1709.0,
+      "completions/mean_length": 649.294677734375,
+      "completions/mean_terminated_length": 618.2432861328125,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 2.810936290946608,
+      "grad_norm": 0.45837923884391785,
+      "kl": 0.123779296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0064,
+      "num_tokens": 324377760.0,
+      "reward": 1.4794644117355347,
+      "reward_std": 0.1530054211616516,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4794642925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.5389159321784973,
+      "step": 2724
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1929.0,
+      "completions/max_terminated_length": 1929.0,
+      "completions/mean_length": 619.857177734375,
+      "completions/mean_terminated_length": 619.857177734375,
+      "completions/min_length": 280.0,
+      "completions/min_terminated_length": 280.0,
+      "epoch": 2.811968016507609,
+      "grad_norm": 0.617780327796936,
+      "kl": 0.1292724609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0078,
+      "num_tokens": 324514489.0,
+      "reward": 1.368749976158142,
+      "reward_std": 0.15938755869865417,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36875003576278687,
+      "rewards/curriculum_aware_reward_fn/std": 0.39536306262016296,
+      "step": 2725
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2127.0,
+      "completions/max_terminated_length": 2127.0,
+      "completions/mean_length": 636.9464721679688,
+      "completions/mean_terminated_length": 636.9464721679688,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 2.8129997420686097,
+      "grad_norm": 0.5487395524978638,
+      "kl": 0.1234130859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0507,
+      "num_tokens": 324657129.0,
+      "reward": 1.4736608266830444,
+      "reward_std": 0.20129023492336273,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4736607074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.4590412974357605,
+      "step": 2726
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1239.0,
+      "completions/max_terminated_length": 1239.0,
+      "completions/mean_length": 542.1964721679688,
+      "completions/mean_terminated_length": 542.1964721679688,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 2.8140314676296105,
+      "grad_norm": 0.5869016647338867,
+      "kl": 0.1339111328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0216,
+      "num_tokens": 324778412.0,
+      "reward": 1.614285945892334,
+      "reward_std": 0.2042093425989151,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6142857670783997,
+      "rewards/curriculum_aware_reward_fn/std": 0.4096487760543823,
+      "step": 2727
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3697.0,
+      "completions/mean_length": 686.5892944335938,
+      "completions/mean_terminated_length": 655.8739013671875,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 2.8150631931906114,
+      "grad_norm": 0.5226927995681763,
+      "kl": 0.1175537109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0266,
+      "num_tokens": 324929510.0,
+      "reward": 1.5303571224212646,
+      "reward_std": 0.15417543053627014,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5303571820259094,
+      "rewards/curriculum_aware_reward_fn/std": 0.44435179233551025,
+      "step": 2728
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2069.0,
+      "completions/max_terminated_length": 2069.0,
+      "completions/mean_length": 589.0267944335938,
+      "completions/mean_terminated_length": 589.0267944335938,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "epoch": 2.816094918751612,
+      "grad_norm": 0.7116428017616272,
+      "kl": 0.127197265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0147,
+      "num_tokens": 325067392.0,
+      "reward": 1.4544644355773926,
+      "reward_std": 0.2100774347782135,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45446428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.40238887071609497,
+      "step": 2729
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2507.0,
+      "completions/max_terminated_length": 2507.0,
+      "completions/mean_length": 517.875,
+      "completions/mean_terminated_length": 517.875,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 2.817126644312613,
+      "grad_norm": 0.7094348073005676,
+      "kl": 0.14013671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0525,
+      "num_tokens": 325193392.0,
+      "reward": 1.706696629524231,
+      "reward_std": 0.24004627764225006,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7066963911056519,
+      "rewards/curriculum_aware_reward_fn/std": 0.35083526372909546,
+      "step": 2730
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1381.0,
+      "completions/max_terminated_length": 1381.0,
+      "completions/mean_length": 482.6250305175781,
+      "completions/mean_terminated_length": 482.6250305175781,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 2.8181583698736135,
+      "grad_norm": 0.6465932726860046,
+      "kl": 0.1090087890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0295,
+      "num_tokens": 325310677.0,
+      "reward": 1.5303571224212646,
+      "reward_std": 0.20120106637477875,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5303571820259094,
+      "rewards/curriculum_aware_reward_fn/std": 0.433680921792984,
+      "step": 2731
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2721.0,
+      "completions/max_terminated_length": 2721.0,
+      "completions/mean_length": 642.7589721679688,
+      "completions/mean_terminated_length": 642.7589721679688,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 2.8191900954346143,
+      "grad_norm": 0.6267445683479309,
+      "kl": 0.1058349609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0034,
+      "num_tokens": 325443650.0,
+      "reward": 1.4102680683135986,
+      "reward_std": 0.2142600417137146,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4102678596973419,
+      "rewards/curriculum_aware_reward_fn/std": 0.3987106680870056,
+      "step": 2732
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1508.0,
+      "completions/max_terminated_length": 1508.0,
+      "completions/mean_length": 583.6964721679688,
+      "completions/mean_terminated_length": 583.6964721679688,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 2.820221820995615,
+      "grad_norm": 0.6053333282470703,
+      "kl": 0.1295166015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0253,
+      "num_tokens": 325575566.0,
+      "reward": 1.549553632736206,
+      "reward_std": 0.23137836158275604,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5495535731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.401881605386734,
+      "step": 2733
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2475.0,
+      "completions/max_terminated_length": 2475.0,
+      "completions/mean_length": 519.1875,
+      "completions/mean_terminated_length": 519.1875,
+      "completions/min_length": 147.0,
+      "completions/min_terminated_length": 147.0,
+      "epoch": 2.821253546556616,
+      "grad_norm": 0.6255543231964111,
+      "kl": 0.1456298828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0323,
+      "num_tokens": 325691416.0,
+      "reward": 1.6580358743667603,
+      "reward_std": 0.20052222907543182,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6580356955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.4227512776851654,
+      "step": 2734
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3016.0,
+      "completions/max_terminated_length": 3016.0,
+      "completions/mean_length": 611.3482666015625,
+      "completions/mean_terminated_length": 611.3482666015625,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 2.822285272117617,
+      "grad_norm": 0.6658999919891357,
+      "kl": 0.1302490234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0274,
+      "num_tokens": 325825764.0,
+      "reward": 1.5982145071029663,
+      "reward_std": 0.1547519564628601,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5982142686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.40503188967704773,
+      "step": 2735
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1929.0,
+      "completions/max_terminated_length": 1929.0,
+      "completions/mean_length": 587.8660888671875,
+      "completions/mean_terminated_length": 587.8660888671875,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "epoch": 2.823316997678617,
+      "grad_norm": 0.6943725943565369,
+      "kl": 0.133544921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0474,
+      "num_tokens": 325953247.0,
+      "reward": 1.4250000715255737,
+      "reward_std": 0.24267423152923584,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42499998211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.38467252254486084,
+      "step": 2736
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3335.0,
+      "completions/max_terminated_length": 3335.0,
+      "completions/mean_length": 592.232177734375,
+      "completions/mean_terminated_length": 592.232177734375,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 2.824348723239618,
+      "grad_norm": 0.6189416646957397,
+      "kl": 0.1094970703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0577,
+      "num_tokens": 326078839.0,
+      "reward": 1.5406252145767212,
+      "reward_std": 0.1452028453350067,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5406249761581421,
+      "rewards/curriculum_aware_reward_fn/std": 0.42982980608940125,
+      "step": 2737
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3383.0,
+      "completions/max_terminated_length": 3383.0,
+      "completions/mean_length": 761.9375610351562,
+      "completions/mean_terminated_length": 761.9375610351562,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 2.825380448800619,
+      "grad_norm": 0.5241042375564575,
+      "kl": 0.1085205078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0348,
+      "num_tokens": 326234085.0,
+      "reward": 1.5888394117355347,
+      "reward_std": 0.21741487085819244,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5888392329216003,
+      "rewards/curriculum_aware_reward_fn/std": 0.4007149934768677,
+      "step": 2738
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1701.0,
+      "completions/mean_length": 574.2767944335938,
+      "completions/mean_terminated_length": 542.549560546875,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "epoch": 2.8264121743616197,
+      "grad_norm": 0.6535649299621582,
+      "kl": 0.12548828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0078,
+      "num_tokens": 326371708.0,
+      "reward": 1.5611608028411865,
+      "reward_std": 0.1966494917869568,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5700892806053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.42066332697868347,
+      "step": 2739
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1909.0,
+      "completions/max_terminated_length": 1909.0,
+      "completions/mean_length": 550.9642944335938,
+      "completions/mean_terminated_length": 550.9642944335938,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "epoch": 2.8274438999226206,
+      "grad_norm": 0.599125325679779,
+      "kl": 0.1212158203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0339,
+      "num_tokens": 326502156.0,
+      "reward": 1.4745535850524902,
+      "reward_std": 0.17932043969631195,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47455355525016785,
+      "rewards/curriculum_aware_reward_fn/std": 0.436251163482666,
+      "step": 2740
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2128.0,
+      "completions/mean_length": 633.0267944335938,
+      "completions/mean_terminated_length": 601.828857421875,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "epoch": 2.8284756254836214,
+      "grad_norm": 0.6590193510055542,
+      "kl": 0.14404296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0427,
+      "num_tokens": 326646660.0,
+      "reward": 1.5750000476837158,
+      "reward_std": 0.12754014134407043,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5839285254478455,
+      "rewards/curriculum_aware_reward_fn/std": 0.4407821595668793,
+      "step": 2741
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2685.0,
+      "completions/max_terminated_length": 2685.0,
+      "completions/mean_length": 577.6607666015625,
+      "completions/mean_terminated_length": 577.6607666015625,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 2.8295073510446223,
+      "grad_norm": 0.5379606485366821,
+      "kl": 0.118896484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0226,
+      "num_tokens": 326771250.0,
+      "reward": 1.4455357789993286,
+      "reward_std": 0.13722234964370728,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44553571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.44935157895088196,
+      "step": 2742
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1714.0,
+      "completions/max_terminated_length": 1714.0,
+      "completions/mean_length": 644.419677734375,
+      "completions/mean_terminated_length": 644.419677734375,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 2.830539076605623,
+      "grad_norm": 0.5652866959571838,
+      "kl": 0.1173095703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0139,
+      "num_tokens": 326917751.0,
+      "reward": 1.4107143878936768,
+      "reward_std": 0.21303237974643707,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4107142984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.3831724226474762,
+      "step": 2743
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2476.0,
+      "completions/max_terminated_length": 2476.0,
+      "completions/mean_length": 647.4642944335938,
+      "completions/mean_terminated_length": 647.4642944335938,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 2.831570802166624,
+      "grad_norm": 0.5718545317649841,
+      "kl": 0.1138916015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0321,
+      "num_tokens": 327065461.0,
+      "reward": 1.4526787996292114,
+      "reward_std": 0.25439465045928955,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45267853140830994,
+      "rewards/curriculum_aware_reward_fn/std": 0.43443238735198975,
+      "step": 2744
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1646.0,
+      "completions/mean_length": 669.2410888671875,
+      "completions/mean_terminated_length": 606.9363403320312,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 2.8326025277276243,
+      "grad_norm": 0.6078090667724609,
+      "kl": 0.1171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0352,
+      "num_tokens": 327200042.0,
+      "reward": 1.380357265472412,
+      "reward_std": 0.20888622105121613,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38928571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.3939536511898041,
+      "step": 2745
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1619.0,
+      "completions/max_terminated_length": 1619.0,
+      "completions/mean_length": 590.4285888671875,
+      "completions/mean_terminated_length": 590.4285888671875,
+      "completions/min_length": 267.0,
+      "completions/min_terminated_length": 267.0,
+      "epoch": 2.833634253288625,
+      "grad_norm": 0.5378431081771851,
+      "kl": 0.120361328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0286,
+      "num_tokens": 327332831.0,
+      "reward": 1.627232313156128,
+      "reward_std": 0.18446555733680725,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6272321343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.38237571716308594,
+      "step": 2746
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2641.0,
+      "completions/max_terminated_length": 2641.0,
+      "completions/mean_length": 603.3482666015625,
+      "completions/mean_terminated_length": 603.3482666015625,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 2.834665978849626,
+      "grad_norm": 0.6392841935157776,
+      "kl": 0.1219482421875,
+      "learning_rate": 1e-06,
+      "loss": -0.0078,
+      "num_tokens": 327466857.0,
+      "reward": 1.5977680683135986,
+      "reward_std": 0.24501563608646393,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5977678298950195,
+      "rewards/curriculum_aware_reward_fn/std": 0.39044836163520813,
+      "step": 2747
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2207.0,
+      "completions/max_terminated_length": 2207.0,
+      "completions/mean_length": 645.875,
+      "completions/mean_terminated_length": 645.875,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 2.835697704410627,
+      "grad_norm": 0.533803403377533,
+      "kl": 0.125244140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0215,
+      "num_tokens": 327609580.0,
+      "reward": 1.5861607789993286,
+      "reward_std": 0.13563905656337738,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5861607193946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.4360557198524475,
+      "step": 2748
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2596.0,
+      "completions/max_terminated_length": 2596.0,
+      "completions/mean_length": 657.2589721679688,
+      "completions/mean_terminated_length": 657.2589721679688,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 2.8367294299716277,
+      "grad_norm": 0.5073429942131042,
+      "kl": 0.1175537109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0134,
+      "num_tokens": 327750904.0,
+      "reward": 1.4535715579986572,
+      "reward_std": 0.13991807401180267,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4535714089870453,
+      "rewards/curriculum_aware_reward_fn/std": 0.44024530053138733,
+      "step": 2749
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1730.0,
+      "completions/max_terminated_length": 1730.0,
+      "completions/mean_length": 606.3482666015625,
+      "completions/mean_terminated_length": 606.3482666015625,
+      "completions/min_length": 177.0,
+      "completions/min_terminated_length": 177.0,
+      "epoch": 2.837761155532628,
+      "grad_norm": 0.6113404035568237,
+      "kl": 0.11865234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0395,
+      "num_tokens": 327879911.0,
+      "reward": 1.4343750476837158,
+      "reward_std": 0.21782106161117554,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43437501788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.41220298409461975,
+      "step": 2750
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1860.0,
+      "completions/max_terminated_length": 1860.0,
+      "completions/mean_length": 582.669677734375,
+      "completions/mean_terminated_length": 582.669677734375,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 2.838792881093629,
+      "grad_norm": 0.6139469146728516,
+      "kl": 0.1182861328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0107,
+      "num_tokens": 328012209.0,
+      "reward": 1.520535945892334,
+      "reward_std": 0.25029444694519043,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5205357670783997,
+      "rewards/curriculum_aware_reward_fn/std": 0.4264642596244812,
+      "step": 2751
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2621.0,
+      "completions/max_terminated_length": 2621.0,
+      "completions/mean_length": 575.5803833007812,
+      "completions/mean_terminated_length": 575.5803833007812,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "epoch": 2.8398246066546298,
+      "grad_norm": 0.6015924215316772,
+      "kl": 0.1124267578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0346,
+      "num_tokens": 328144476.0,
+      "reward": 1.5433037281036377,
+      "reward_std": 0.198805034160614,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5433035492897034,
+      "rewards/curriculum_aware_reward_fn/std": 0.406728059053421,
+      "step": 2752
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2968.0,
+      "completions/mean_length": 690.5089721679688,
+      "completions/mean_terminated_length": 659.828857421875,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 2.8408563322156306,
+      "grad_norm": 0.6319563984870911,
+      "kl": 0.1114501953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0021,
+      "num_tokens": 328289587.0,
+      "reward": 1.383928656578064,
+      "reward_std": 0.24402377009391785,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4017857015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.4391293525695801,
+      "step": 2753
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1251.0,
+      "completions/max_terminated_length": 1251.0,
+      "completions/mean_length": 524.125,
+      "completions/mean_terminated_length": 524.125,
+      "completions/min_length": 179.0,
+      "completions/min_terminated_length": 179.0,
+      "epoch": 2.8418880577766314,
+      "grad_norm": 0.5966294407844543,
+      "kl": 0.1282958984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0474,
+      "num_tokens": 328418450.0,
+      "reward": 1.462053656578064,
+      "reward_std": 0.16561885178089142,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4709821343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.4223293662071228,
+      "step": 2754
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2748.0,
+      "completions/max_terminated_length": 2748.0,
+      "completions/mean_length": 646.3482666015625,
+      "completions/mean_terminated_length": 646.3482666015625,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 2.8429197833376323,
+      "grad_norm": 0.5699669122695923,
+      "kl": 0.1124267578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0129,
+      "num_tokens": 328553904.0,
+      "reward": 1.5258928537368774,
+      "reward_std": 0.23065383732318878,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5348213911056519,
+      "rewards/curriculum_aware_reward_fn/std": 0.43219736218452454,
+      "step": 2755
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1848.0,
+      "completions/max_terminated_length": 1848.0,
+      "completions/mean_length": 570.794677734375,
+      "completions/mean_terminated_length": 570.794677734375,
+      "completions/min_length": 173.0,
+      "completions/min_terminated_length": 173.0,
+      "epoch": 2.843951508898633,
+      "grad_norm": 0.5182858109474182,
+      "kl": 0.1142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0274,
+      "num_tokens": 328687413.0,
+      "reward": 1.6473214626312256,
+      "reward_std": 0.20871160924434662,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6473214030265808,
+      "rewards/curriculum_aware_reward_fn/std": 0.5464397072792053,
+      "step": 2756
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1819.0,
+      "completions/max_terminated_length": 1819.0,
+      "completions/mean_length": 706.3839721679688,
+      "completions/mean_terminated_length": 706.3839721679688,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "epoch": 2.844983234459634,
+      "grad_norm": 0.5897096991539001,
+      "kl": 0.1068115234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0127,
+      "num_tokens": 328831369.0,
+      "reward": 1.379910945892334,
+      "reward_std": 0.20769324898719788,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3799107074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.3808158338069916,
+      "step": 2757
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3134.0,
+      "completions/mean_length": 649.4553833007812,
+      "completions/mean_terminated_length": 618.4053955078125,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 2.8460149600206344,
+      "grad_norm": 0.6046122312545776,
+      "kl": 0.1082763671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0676,
+      "num_tokens": 328976761.0,
+      "reward": 1.5232144594192505,
+      "reward_std": 0.255319744348526,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5410714745521545,
+      "rewards/curriculum_aware_reward_fn/std": 0.44656920433044434,
+      "step": 2758
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1459.0,
+      "completions/mean_length": 615.5089721679688,
+      "completions/mean_terminated_length": 552.2272338867188,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 2.847046685581635,
+      "grad_norm": 0.6533037424087524,
+      "kl": 0.11572265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0589,
+      "num_tokens": 329112086.0,
+      "reward": 1.6267858743667603,
+      "reward_std": 0.24057099223136902,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6267856955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.3920912444591522,
+      "step": 2759
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2279.0,
+      "completions/mean_length": 679.419677734375,
+      "completions/mean_terminated_length": 648.6396484375,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 2.848078411142636,
+      "grad_norm": 0.6277405619621277,
+      "kl": 0.1229248046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0315,
+      "num_tokens": 329255815.0,
+      "reward": 1.4151787757873535,
+      "reward_std": 0.23310363292694092,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4151785671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.4091762602329254,
+      "step": 2760
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3006.0,
+      "completions/max_terminated_length": 3006.0,
+      "completions/mean_length": 806.8750610351562,
+      "completions/mean_terminated_length": 806.8750610351562,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "epoch": 2.849110136703637,
+      "grad_norm": 0.5804691910743713,
+      "kl": 0.1204833984375,
+      "learning_rate": 1e-06,
+      "loss": -0.008,
+      "num_tokens": 329422254.0,
+      "reward": 1.368303656578064,
+      "reward_std": 0.15276534855365753,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3683035671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.3609294593334198,
+      "step": 2761
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2329.0,
+      "completions/max_terminated_length": 2329.0,
+      "completions/mean_length": 783.7857666015625,
+      "completions/mean_terminated_length": 783.7857666015625,
+      "completions/min_length": 258.0,
+      "completions/min_terminated_length": 258.0,
+      "epoch": 2.8501418622646377,
+      "grad_norm": 0.5310266017913818,
+      "kl": 0.1234130859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0319,
+      "num_tokens": 329580689.0,
+      "reward": 1.231696605682373,
+      "reward_std": 0.16519694030284882,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.24955357611179352,
+      "rewards/curriculum_aware_reward_fn/std": 0.34896859526634216,
+      "step": 2762
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3746.0,
+      "completions/max_terminated_length": 3746.0,
+      "completions/mean_length": 704.6964721679688,
+      "completions/mean_terminated_length": 704.6964721679688,
+      "completions/min_length": 264.0,
+      "completions/min_terminated_length": 264.0,
+      "epoch": 2.851173587825638,
+      "grad_norm": 0.5648817420005798,
+      "kl": 0.1199951171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0221,
+      "num_tokens": 329719689.0,
+      "reward": 1.5000001192092896,
+      "reward_std": 0.23215486109256744,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5,
+      "rewards/curriculum_aware_reward_fn/std": 0.43137115240097046,
+      "step": 2763
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3582.0,
+      "completions/max_terminated_length": 3582.0,
+      "completions/mean_length": 713.9107666015625,
+      "completions/mean_terminated_length": 713.9107666015625,
+      "completions/min_length": 283.0,
+      "completions/min_terminated_length": 283.0,
+      "epoch": 2.852205313386639,
+      "grad_norm": 0.5491836667060852,
+      "kl": 0.117919921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0367,
+      "num_tokens": 329872860.0,
+      "reward": 1.566517949104309,
+      "reward_std": 0.15733234584331512,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5665178894996643,
+      "rewards/curriculum_aware_reward_fn/std": 0.40516865253448486,
+      "step": 2764
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2924.0,
+      "completions/max_terminated_length": 2924.0,
+      "completions/mean_length": 811.982177734375,
+      "completions/mean_terminated_length": 811.982177734375,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 2.85323703894764,
+      "grad_norm": 0.5808110237121582,
+      "kl": 0.1220703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0553,
+      "num_tokens": 330039416.0,
+      "reward": 1.4968751668930054,
+      "reward_std": 0.24768134951591492,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5058035850524902,
+      "rewards/curriculum_aware_reward_fn/std": 0.42789793014526367,
+      "step": 2765
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3141.0,
+      "completions/max_terminated_length": 3141.0,
+      "completions/mean_length": 748.7232666015625,
+      "completions/mean_terminated_length": 748.7232666015625,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "epoch": 2.8542687645086406,
+      "grad_norm": 0.4694342017173767,
+      "kl": 0.118408203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0046,
+      "num_tokens": 330197711.0,
+      "reward": 1.4142857789993286,
+      "reward_std": 0.12062786519527435,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41428571939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.5386180281639099,
+      "step": 2766
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2854.0,
+      "completions/max_terminated_length": 2854.0,
+      "completions/mean_length": 840.2053833007812,
+      "completions/mean_terminated_length": 840.2053833007812,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 2.8553004900696415,
+      "grad_norm": 0.5480297803878784,
+      "kl": 0.1085205078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0201,
+      "num_tokens": 330355913.0,
+      "reward": 1.4700894355773926,
+      "reward_std": 0.1603080928325653,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47901788353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.4383958876132965,
+      "step": 2767
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3313.0,
+      "completions/max_terminated_length": 3313.0,
+      "completions/mean_length": 571.232177734375,
+      "completions/mean_terminated_length": 571.232177734375,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 2.8563322156306423,
+      "grad_norm": 0.49730056524276733,
+      "kl": 0.1295166015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0083,
+      "num_tokens": 330470935.0,
+      "reward": 1.669196605682373,
+      "reward_std": 0.12415627390146255,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6691964268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.41395941376686096,
+      "step": 2768
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3258.0,
+      "completions/max_terminated_length": 3258.0,
+      "completions/mean_length": 697.1250610351562,
+      "completions/mean_terminated_length": 697.1250610351562,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 2.857363941191643,
+      "grad_norm": 0.5766168236732483,
+      "kl": 0.1219482421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0916,
+      "num_tokens": 330613336.0,
+      "reward": 1.6540179252624512,
+      "reward_std": 0.22875763475894928,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6540178656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.4130276143550873,
+      "step": 2769
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2671.0,
+      "completions/mean_length": 701.9464721679688,
+      "completions/mean_terminated_length": 671.369384765625,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 2.858395666752644,
+      "grad_norm": 0.5088793039321899,
+      "kl": 0.1124267578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0451,
+      "num_tokens": 330753596.0,
+      "reward": 1.5861607789993286,
+      "reward_std": 0.17370003461837769,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5861607193946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.5460014343261719,
+      "step": 2770
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2800.0,
+      "completions/max_terminated_length": 2800.0,
+      "completions/mean_length": 639.9285888671875,
+      "completions/mean_terminated_length": 639.9285888671875,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 2.859427392313645,
+      "grad_norm": 0.6735429763793945,
+      "kl": 0.1207275390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0056,
+      "num_tokens": 330890606.0,
+      "reward": 1.4941965341567993,
+      "reward_std": 0.1921509951353073,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49419641494750977,
+      "rewards/curriculum_aware_reward_fn/std": 0.4499121606349945,
+      "step": 2771
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 4082.0,
+      "completions/max_terminated_length": 4082.0,
+      "completions/mean_length": 805.8928833007812,
+      "completions/mean_terminated_length": 805.8928833007812,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 2.8604591178746452,
+      "grad_norm": 0.5096302628517151,
+      "kl": 0.111328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0015,
+      "num_tokens": 331049949.0,
+      "reward": 1.5459821224212646,
+      "reward_std": 0.17727798223495483,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5549107193946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.42964261770248413,
+      "step": 2772
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3661.0,
+      "completions/max_terminated_length": 3661.0,
+      "completions/mean_length": 619.107177734375,
+      "completions/mean_terminated_length": 619.107177734375,
+      "completions/min_length": 185.0,
+      "completions/min_terminated_length": 185.0,
+      "epoch": 2.861490843435646,
+      "grad_norm": 0.5546408891677856,
+      "kl": 0.125,
+      "learning_rate": 1e-06,
+      "loss": 0.0497,
+      "num_tokens": 331189877.0,
+      "reward": 1.5205358266830444,
+      "reward_std": 0.09648904204368591,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5205357670783997,
+      "rewards/curriculum_aware_reward_fn/std": 0.4294640123844147,
+      "step": 2773
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3947.0,
+      "completions/mean_length": 982.4732666015625,
+      "completions/mean_terminated_length": 954.4234619140625,
+      "completions/min_length": 363.0,
+      "completions/min_terminated_length": 363.0,
+      "epoch": 2.862522568996647,
+      "grad_norm": 0.5640942454338074,
+      "kl": 0.1187744140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0017,
+      "num_tokens": 331377719.0,
+      "reward": 1.2647321224212646,
+      "reward_std": 0.21712327003479004,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.27366071939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3806679844856262,
+      "step": 2774
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1928.0,
+      "completions/max_terminated_length": 1928.0,
+      "completions/mean_length": 547.6428833007812,
+      "completions/mean_terminated_length": 547.6428833007812,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 2.8635542945576478,
+      "grad_norm": 0.5652421116828918,
+      "kl": 0.122802734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0065,
+      "num_tokens": 331505114.0,
+      "reward": 1.849107265472412,
+      "reward_std": 0.19259557127952576,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.858035683631897,
+      "rewards/curriculum_aware_reward_fn/std": 0.3003416657447815,
+      "step": 2775
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3592.0,
+      "completions/mean_length": 880.3839721679688,
+      "completions/mean_terminated_length": 851.4144287109375,
+      "completions/min_length": 345.0,
+      "completions/min_terminated_length": 345.0,
+      "epoch": 2.8645860201186486,
+      "grad_norm": 0.5577312111854553,
+      "kl": 0.1126708984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0028,
+      "num_tokens": 331668782.0,
+      "reward": 1.4937502145767212,
+      "reward_std": 0.21671874821186066,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4937500059604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.4100167453289032,
+      "step": 2776
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2131.0,
+      "completions/max_terminated_length": 2131.0,
+      "completions/mean_length": 712.4285888671875,
+      "completions/mean_terminated_length": 712.4285888671875,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 2.865617745679649,
+      "grad_norm": 0.49200648069381714,
+      "kl": 0.12451171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0173,
+      "num_tokens": 331810006.0,
+      "reward": 1.5785716772079468,
+      "reward_std": 0.16207048296928406,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5785713791847229,
+      "rewards/curriculum_aware_reward_fn/std": 0.4149087369441986,
+      "step": 2777
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2793.0,
+      "completions/max_terminated_length": 2793.0,
+      "completions/mean_length": 710.1517944335938,
+      "completions/mean_terminated_length": 710.1517944335938,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 2.86664947124065,
+      "grad_norm": 0.5689948797225952,
+      "kl": 0.1351318359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0082,
+      "num_tokens": 331957618.0,
+      "reward": 1.630357265472412,
+      "reward_std": 0.22884601354599,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6303571462631226,
+      "rewards/curriculum_aware_reward_fn/std": 0.40131300687789917,
+      "step": 2778
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2850.0,
+      "completions/mean_length": 929.6160888671875,
+      "completions/mean_terminated_length": 872.04541015625,
+      "completions/min_length": 279.0,
+      "completions/min_terminated_length": 279.0,
+      "epoch": 2.8676811968016507,
+      "grad_norm": 0.4698331356048584,
+      "kl": 0.1124267578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0131,
+      "num_tokens": 332132715.0,
+      "reward": 1.5125000476837158,
+      "reward_std": 0.19689759612083435,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.512499988079071,
+      "rewards/curriculum_aware_reward_fn/std": 0.39045479893684387,
+      "step": 2779
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2132.0,
+      "completions/max_terminated_length": 2132.0,
+      "completions/mean_length": 624.1607666015625,
+      "completions/mean_terminated_length": 624.1607666015625,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 2.8687129223626515,
+      "grad_norm": 0.560449481010437,
+      "kl": 0.1348876953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0008,
+      "num_tokens": 332272866.0,
+      "reward": 1.7741073369979858,
+      "reward_std": 0.14867447316646576,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7741070985794067,
+      "rewards/curriculum_aware_reward_fn/std": 0.36304405331611633,
+      "step": 2780
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2469.0,
+      "completions/max_terminated_length": 2469.0,
+      "completions/mean_length": 788.3482666015625,
+      "completions/mean_terminated_length": 788.3482666015625,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "epoch": 2.8697446479236524,
+      "grad_norm": 0.6889148354530334,
+      "kl": 0.1185302734375,
+      "learning_rate": 1e-06,
+      "loss": 0.028,
+      "num_tokens": 332425309.0,
+      "reward": 1.471428632736206,
+      "reward_std": 0.25298944115638733,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4714285731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.4599605202674866,
+      "step": 2781
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3227.0,
+      "completions/mean_length": 819.2767944335938,
+      "completions/mean_terminated_length": 789.7567749023438,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 2.870776373484653,
+      "grad_norm": 0.5370697975158691,
+      "kl": 0.124267578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0406,
+      "num_tokens": 332581454.0,
+      "reward": 1.4379466772079468,
+      "reward_std": 0.24798668920993805,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4379464089870453,
+      "rewards/curriculum_aware_reward_fn/std": 0.43724387884140015,
+      "step": 2782
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3573.0,
+      "completions/max_terminated_length": 3573.0,
+      "completions/mean_length": 672.4375,
+      "completions/mean_terminated_length": 672.4375,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 2.871808099045654,
+      "grad_norm": 0.5681089758872986,
+      "kl": 0.1175537109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0119,
+      "num_tokens": 332718437.0,
+      "reward": 1.7406251430511475,
+      "reward_std": 0.12895211577415466,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7406249642372131,
+      "rewards/curriculum_aware_reward_fn/std": 0.37141871452331543,
+      "step": 2783
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3656.0,
+      "completions/max_terminated_length": 3656.0,
+      "completions/mean_length": 722.4732666015625,
+      "completions/mean_terminated_length": 722.4732666015625,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 2.872839824606655,
+      "grad_norm": 0.4188723862171173,
+      "kl": 0.1199951171875,
+      "learning_rate": 1e-06,
+      "loss": 0.055,
+      "num_tokens": 332865182.0,
+      "reward": 1.532589316368103,
+      "reward_std": 0.09648028761148453,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.532589316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.45031070709228516,
+      "step": 2784
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2457.0,
+      "completions/max_terminated_length": 2457.0,
+      "completions/mean_length": 710.6339721679688,
+      "completions/mean_terminated_length": 710.6339721679688,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 2.8738715501676553,
+      "grad_norm": 0.6341184973716736,
+      "kl": 0.1148681640625,
+      "learning_rate": 1e-06,
+      "loss": 0.034,
+      "num_tokens": 333005939.0,
+      "reward": 1.5611608028411865,
+      "reward_std": 0.2271462231874466,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5611607432365417,
+      "rewards/curriculum_aware_reward_fn/std": 0.4104880392551422,
+      "step": 2785
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 4080.0,
+      "completions/max_terminated_length": 4080.0,
+      "completions/mean_length": 785.3482666015625,
+      "completions/mean_terminated_length": 785.3482666015625,
+      "completions/min_length": 291.0,
+      "completions/min_terminated_length": 291.0,
+      "epoch": 2.874903275728656,
+      "grad_norm": 0.5458928942680359,
+      "kl": 0.1356201171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0053,
+      "num_tokens": 333163331.0,
+      "reward": 1.3892858028411865,
+      "reward_std": 0.22566410899162292,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39821428060531616,
+      "rewards/curriculum_aware_reward_fn/std": 0.43308696150779724,
+      "step": 2786
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2467.0,
+      "completions/max_terminated_length": 2467.0,
+      "completions/mean_length": 808.5535888671875,
+      "completions/mean_terminated_length": 808.5535888671875,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 2.875935001289657,
+      "grad_norm": 0.6074646711349487,
+      "kl": 0.12451171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0106,
+      "num_tokens": 333327725.0,
+      "reward": 1.5156251192092896,
+      "reward_std": 0.21479451656341553,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5245535969734192,
+      "rewards/curriculum_aware_reward_fn/std": 0.41708964109420776,
+      "step": 2787
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2659.0,
+      "completions/mean_length": 827.1964721679688,
+      "completions/mean_terminated_length": 767.7636108398438,
+      "completions/min_length": 264.0,
+      "completions/min_terminated_length": 264.0,
+      "epoch": 2.876966726850658,
+      "grad_norm": 0.5130965113639832,
+      "kl": 0.1142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0503,
+      "num_tokens": 333495621.0,
+      "reward": 1.5142858028411865,
+      "reward_std": 0.22739137709140778,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5142857432365417,
+      "rewards/curriculum_aware_reward_fn/std": 0.43472206592559814,
+      "step": 2788
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2858.0,
+      "completions/mean_length": 929.4732666015625,
+      "completions/mean_terminated_length": 900.9459838867188,
+      "completions/min_length": 299.0,
+      "completions/min_terminated_length": 299.0,
+      "epoch": 2.8779984524116586,
+      "grad_norm": 0.5246358513832092,
+      "kl": 0.1146240234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0464,
+      "num_tokens": 333678829.0,
+      "reward": 1.4263393878936768,
+      "reward_std": 0.26658251881599426,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4352678656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.4434019923210144,
+      "step": 2789
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1959.0,
+      "completions/max_terminated_length": 1959.0,
+      "completions/mean_length": 650.1428833007812,
+      "completions/mean_terminated_length": 650.1428833007812,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 2.879030177972659,
+      "grad_norm": 0.6150388121604919,
+      "kl": 0.1239013671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0082,
+      "num_tokens": 333819846.0,
+      "reward": 1.581696629524231,
+      "reward_std": 0.2450869381427765,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5816964507102966,
+      "rewards/curriculum_aware_reward_fn/std": 0.42833197116851807,
+      "step": 2790
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2465.0,
+      "completions/mean_length": 782.1964721679688,
+      "completions/mean_terminated_length": 752.3423461914062,
+      "completions/min_length": 271.0,
+      "completions/min_terminated_length": 271.0,
+      "epoch": 2.88006190353366,
+      "grad_norm": 0.47666504979133606,
+      "kl": 0.1256103515625,
+      "learning_rate": 1e-06,
+      "loss": -0.003,
+      "num_tokens": 333977641.0,
+      "reward": 1.5531251430511475,
+      "reward_std": 0.1824059635400772,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5531250238418579,
+      "rewards/curriculum_aware_reward_fn/std": 0.3910762071609497,
+      "step": 2791
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3040.0,
+      "completions/max_terminated_length": 3040.0,
+      "completions/mean_length": 796.982177734375,
+      "completions/mean_terminated_length": 796.982177734375,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 2.8810936290946607,
+      "grad_norm": 0.5374991297721863,
+      "kl": 0.1209716796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0343,
+      "num_tokens": 334127880.0,
+      "reward": 1.5241073369979858,
+      "reward_std": 0.22164270281791687,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5241070985794067,
+      "rewards/curriculum_aware_reward_fn/std": 0.4269091784954071,
+      "step": 2792
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3287.0,
+      "completions/max_terminated_length": 3287.0,
+      "completions/mean_length": 786.6250610351562,
+      "completions/mean_terminated_length": 786.6250610351562,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 2.8821253546556616,
+      "grad_norm": 0.5066479444503784,
+      "kl": 0.1220703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0033,
+      "num_tokens": 334284099.0,
+      "reward": 1.5000001192092896,
+      "reward_std": 0.1792917400598526,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5,
+      "rewards/curriculum_aware_reward_fn/std": 0.4358898997306824,
+      "step": 2793
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2175.0,
+      "completions/max_terminated_length": 2175.0,
+      "completions/mean_length": 705.2232666015625,
+      "completions/mean_terminated_length": 705.2232666015625,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 2.8831570802166624,
+      "grad_norm": 0.6270851492881775,
+      "kl": 0.120361328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0432,
+      "num_tokens": 334421051.0,
+      "reward": 1.5513395071029663,
+      "reward_std": 0.21944689750671387,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5513392686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.3947550058364868,
+      "step": 2794
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3129.0,
+      "completions/max_terminated_length": 3129.0,
+      "completions/mean_length": 823.3392944335938,
+      "completions/mean_terminated_length": 823.3392944335938,
+      "completions/min_length": 155.0,
+      "completions/min_terminated_length": 155.0,
+      "epoch": 2.8841888057776632,
+      "grad_norm": 0.5626925826072693,
+      "kl": 0.1043701171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0111,
+      "num_tokens": 334585281.0,
+      "reward": 1.4611607789993286,
+      "reward_std": 0.11960483342409134,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47008928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.4516465961933136,
+      "step": 2795
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2989.0,
+      "completions/max_terminated_length": 2989.0,
+      "completions/mean_length": 820.4464721679688,
+      "completions/mean_terminated_length": 820.4464721679688,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 2.885220531338664,
+      "grad_norm": 0.5923090577125549,
+      "kl": 0.1287841796875,
+      "learning_rate": 1e-06,
+      "loss": -0.001,
+      "num_tokens": 334744364.0,
+      "reward": 1.5718750953674316,
+      "reward_std": 0.3103684186935425,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5718750357627869,
+      "rewards/curriculum_aware_reward_fn/std": 0.4846648871898651,
+      "step": 2796
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3144.0,
+      "completions/mean_length": 881.0714721679688,
+      "completions/mean_terminated_length": 852.108154296875,
+      "completions/min_length": 258.0,
+      "completions/min_terminated_length": 258.0,
+      "epoch": 2.886252256899665,
+      "grad_norm": 0.53386390209198,
+      "kl": 0.120849609375,
+      "learning_rate": 1e-06,
+      "loss": -0.0667,
+      "num_tokens": 334909243.0,
+      "reward": 1.4450894594192505,
+      "reward_std": 0.16090065240859985,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44508931040763855,
+      "rewards/curriculum_aware_reward_fn/std": 0.4154103398323059,
+      "step": 2797
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3976.0,
+      "completions/max_terminated_length": 3976.0,
+      "completions/mean_length": 712.8750610351562,
+      "completions/mean_terminated_length": 712.8750610351562,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 2.8872839824606653,
+      "grad_norm": 0.6113415956497192,
+      "kl": 0.1234130859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0284,
+      "num_tokens": 335048200.0,
+      "reward": 1.5406252145767212,
+      "reward_std": 0.16090570390224457,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5495535731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.4137009382247925,
+      "step": 2798
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3375.0,
+      "completions/mean_length": 783.0267944335938,
+      "completions/mean_terminated_length": 753.18017578125,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "epoch": 2.888315708021666,
+      "grad_norm": 0.5956709384918213,
+      "kl": 0.1158447265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0327,
+      "num_tokens": 335197856.0,
+      "reward": 1.5160715579986572,
+      "reward_std": 0.17236091196537018,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5249999761581421,
+      "rewards/curriculum_aware_reward_fn/std": 0.4052026569843292,
+      "step": 2799
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1861.0,
+      "completions/max_terminated_length": 1861.0,
+      "completions/mean_length": 654.1964721679688,
+      "completions/mean_terminated_length": 654.1964721679688,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 2.889347433582667,
+      "grad_norm": 0.6350661516189575,
+      "kl": 0.131103515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0495,
+      "num_tokens": 335340475.0,
+      "reward": 1.6669644117355347,
+      "reward_std": 0.2553524374961853,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6669642329216003,
+      "rewards/curriculum_aware_reward_fn/std": 0.5253721475601196,
+      "step": 2800
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2500.0,
+      "completions/mean_length": 702.3928833007812,
+      "completions/mean_terminated_length": 671.81982421875,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "epoch": 2.890379159143668,
+      "grad_norm": 0.49716106057167053,
+      "kl": 0.1234130859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0097,
+      "num_tokens": 335487652.0,
+      "reward": 1.5357143878936768,
+      "reward_std": 0.1621207445859909,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5357142686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.4404353201389313,
+      "step": 2801
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2283.0,
+      "completions/max_terminated_length": 2283.0,
+      "completions/mean_length": 720.2767944335938,
+      "completions/mean_terminated_length": 720.2767944335938,
+      "completions/min_length": 283.0,
+      "completions/min_terminated_length": 283.0,
+      "epoch": 2.8914108847046687,
+      "grad_norm": 0.5649811625480652,
+      "kl": 0.1072998046875,
+      "learning_rate": 1e-06,
+      "loss": 0.002,
+      "num_tokens": 335639044.0,
+      "reward": 1.4316965341567993,
+      "reward_std": 0.16086453199386597,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43169641494750977,
+      "rewards/curriculum_aware_reward_fn/std": 0.45564502477645874,
+      "step": 2802
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2778.0,
+      "completions/max_terminated_length": 2778.0,
+      "completions/mean_length": 695.732177734375,
+      "completions/mean_terminated_length": 695.732177734375,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 2.892442610265669,
+      "grad_norm": 0.5937812328338623,
+      "kl": 0.13037109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0075,
+      "num_tokens": 335782052.0,
+      "reward": 1.5696429014205933,
+      "reward_std": 0.26064127683639526,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5785714387893677,
+      "rewards/curriculum_aware_reward_fn/std": 0.4103785455226898,
+      "step": 2803
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1304.0,
+      "completions/max_terminated_length": 1304.0,
+      "completions/mean_length": 562.3125,
+      "completions/mean_terminated_length": 562.3125,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 2.89347433582667,
+      "grad_norm": 0.6014729142189026,
+      "kl": 0.142822265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0243,
+      "num_tokens": 335903824.0,
+      "reward": 1.4691965579986572,
+      "reward_std": 0.17668238282203674,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46919646859169006,
+      "rewards/curriculum_aware_reward_fn/std": 0.4110109210014343,
+      "step": 2804
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1903.0,
+      "completions/mean_length": 748.5803833007812,
+      "completions/mean_terminated_length": 718.4234619140625,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "epoch": 2.8945060613876707,
+      "grad_norm": 0.5668159127235413,
+      "kl": 0.12548828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0038,
+      "num_tokens": 336058855.0,
+      "reward": 1.516964316368103,
+      "reward_std": 0.16923758387565613,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5169642567634583,
+      "rewards/curriculum_aware_reward_fn/std": 0.3730536997318268,
+      "step": 2805
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2043.0,
+      "completions/max_terminated_length": 2043.0,
+      "completions/mean_length": 551.2589721679688,
+      "completions/mean_terminated_length": 551.2589721679688,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 2.8955377869486716,
+      "grad_norm": 0.603298544883728,
+      "kl": 0.13525390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0425,
+      "num_tokens": 336180509.0,
+      "reward": 1.6491073369979858,
+      "reward_std": 0.14685435593128204,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6491071581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.40730834007263184,
+      "step": 2806
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2774.0,
+      "completions/mean_length": 750.1785888671875,
+      "completions/mean_terminated_length": 720.0360717773438,
+      "completions/min_length": 123.0,
+      "completions/min_terminated_length": 123.0,
+      "epoch": 2.8965695125096724,
+      "grad_norm": 0.6138197779655457,
+      "kl": 0.1397705078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0226,
+      "num_tokens": 336322209.0,
+      "reward": 1.5781251192092896,
+      "reward_std": 0.23737137019634247,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.578125,
+      "rewards/curriculum_aware_reward_fn/std": 0.40829136967658997,
+      "step": 2807
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3321.0,
+      "completions/max_terminated_length": 3321.0,
+      "completions/mean_length": 701.794677734375,
+      "completions/mean_terminated_length": 701.794677734375,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 2.8976012380706733,
+      "grad_norm": 0.4589069187641144,
+      "kl": 0.11572265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0257,
+      "num_tokens": 336471041.0,
+      "reward": 1.4861607551574707,
+      "reward_std": 0.17394407093524933,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4950892925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.4386177957057953,
+      "step": 2808
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2001.0,
+      "completions/max_terminated_length": 2001.0,
+      "completions/mean_length": 665.9017944335938,
+      "completions/mean_terminated_length": 665.9017944335938,
+      "completions/min_length": 279.0,
+      "completions/min_terminated_length": 279.0,
+      "epoch": 2.898632963631674,
+      "grad_norm": 0.5997697710990906,
+      "kl": 0.128173828125,
+      "learning_rate": 1e-06,
+      "loss": 0.012,
+      "num_tokens": 336614026.0,
+      "reward": 1.4651787281036377,
+      "reward_std": 0.25731945037841797,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46517857909202576,
+      "rewards/curriculum_aware_reward_fn/std": 0.41928935050964355,
+      "step": 2809
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1818.0,
+      "completions/max_terminated_length": 1818.0,
+      "completions/mean_length": 557.6160888671875,
+      "completions/mean_terminated_length": 557.6160888671875,
+      "completions/min_length": 257.0,
+      "completions/min_terminated_length": 257.0,
+      "epoch": 2.899664689192675,
+      "grad_norm": 0.6498278975486755,
+      "kl": 0.144287109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0003,
+      "num_tokens": 336741677.0,
+      "reward": 1.740625023841858,
+      "reward_std": 0.16024711728096008,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7406249642372131,
+      "rewards/curriculum_aware_reward_fn/std": 0.34838956594467163,
+      "step": 2810
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3370.0,
+      "completions/max_terminated_length": 3370.0,
+      "completions/mean_length": 698.1964721679688,
+      "completions/mean_terminated_length": 698.1964721679688,
+      "completions/min_length": 264.0,
+      "completions/min_terminated_length": 264.0,
+      "epoch": 2.900696414753676,
+      "grad_norm": 0.5694387555122375,
+      "kl": 0.121337890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0339,
+      "num_tokens": 336887621.0,
+      "reward": 1.4205358028411865,
+      "reward_std": 0.21582452952861786,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42053571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.425618439912796,
+      "step": 2811
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2319.0,
+      "completions/max_terminated_length": 2319.0,
+      "completions/mean_length": 653.6785888671875,
+      "completions/mean_terminated_length": 653.6785888671875,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "epoch": 2.901728140314676,
+      "grad_norm": 0.6535736918449402,
+      "kl": 0.1265869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0065,
+      "num_tokens": 337025618.0,
+      "reward": 1.5375001430511475,
+      "reward_std": 0.19925980269908905,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5374999642372131,
+      "rewards/curriculum_aware_reward_fn/std": 0.4090474545955658,
+      "step": 2812
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2772.0,
+      "completions/max_terminated_length": 2772.0,
+      "completions/mean_length": 771.8303833007812,
+      "completions/mean_terminated_length": 771.8303833007812,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "epoch": 2.902759865875677,
+      "grad_norm": 0.44244030117988586,
+      "kl": 0.116455078125,
+      "learning_rate": 1e-06,
+      "loss": -0.042,
+      "num_tokens": 337180082.0,
+      "reward": 1.5151785612106323,
+      "reward_std": 0.20895953476428986,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5241071581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.4499240219593048,
+      "step": 2813
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3413.0,
+      "completions/max_terminated_length": 3413.0,
+      "completions/mean_length": 795.6160888671875,
+      "completions/mean_terminated_length": 795.6160888671875,
+      "completions/min_length": 129.0,
+      "completions/min_terminated_length": 129.0,
+      "epoch": 2.903791591436678,
+      "grad_norm": 0.4640490412712097,
+      "kl": 0.125244140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0219,
+      "num_tokens": 337326080.0,
+      "reward": 1.4522321224212646,
+      "reward_std": 0.13497215509414673,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45223212242126465,
+      "rewards/curriculum_aware_reward_fn/std": 0.462191641330719,
+      "step": 2814
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2401.0,
+      "completions/max_terminated_length": 2401.0,
+      "completions/mean_length": 745.0357666015625,
+      "completions/mean_terminated_length": 745.0357666015625,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 2.9048233169976787,
+      "grad_norm": 0.5465977191925049,
+      "kl": 0.117431640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0153,
+      "num_tokens": 337480854.0,
+      "reward": 1.5299108028411865,
+      "reward_std": 0.1310456395149231,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.529910683631897,
+      "rewards/curriculum_aware_reward_fn/std": 0.4423249363899231,
+      "step": 2815
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3062.0,
+      "completions/max_terminated_length": 3062.0,
+      "completions/mean_length": 760.732177734375,
+      "completions/mean_terminated_length": 760.732177734375,
+      "completions/min_length": 275.0,
+      "completions/min_terminated_length": 275.0,
+      "epoch": 2.9058550425586795,
+      "grad_norm": 0.6337264776229858,
+      "kl": 0.1339111328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0207,
+      "num_tokens": 337628125.0,
+      "reward": 1.5000001192092896,
+      "reward_std": 0.24028640985488892,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5,
+      "rewards/curriculum_aware_reward_fn/std": 0.37835583090782166,
+      "step": 2816
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3695.0,
+      "completions/mean_length": 868.7767944335938,
+      "completions/mean_terminated_length": 839.7026977539062,
+      "completions/min_length": 312.0,
+      "completions/min_terminated_length": 312.0,
+      "epoch": 2.90688676811968,
+      "grad_norm": 0.49252942204475403,
+      "kl": 0.114013671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0217,
+      "num_tokens": 337793615.0,
+      "reward": 1.4214287996292114,
+      "reward_std": 0.22298011183738708,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4214285910129547,
+      "rewards/curriculum_aware_reward_fn/std": 0.37414854764938354,
+      "step": 2817
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2959.0,
+      "completions/mean_length": 753.9375610351562,
+      "completions/mean_terminated_length": 723.828857421875,
+      "completions/min_length": 102.0,
+      "completions/min_terminated_length": 102.0,
+      "epoch": 2.907918493680681,
+      "grad_norm": 0.49966421723365784,
+      "kl": 0.1248779296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0143,
+      "num_tokens": 337950723.0,
+      "reward": 1.646875023841858,
+      "reward_std": 0.17419442534446716,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6558035612106323,
+      "rewards/curriculum_aware_reward_fn/std": 0.3948283791542053,
+      "step": 2818
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3673.0,
+      "completions/max_terminated_length": 3673.0,
+      "completions/mean_length": 786.0982666015625,
+      "completions/mean_terminated_length": 786.0982666015625,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 2.9089502192416816,
+      "grad_norm": 0.5664613842964172,
+      "kl": 0.1182861328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0722,
+      "num_tokens": 338106337.0,
+      "reward": 1.5683037042617798,
+      "reward_std": 0.25919803977012634,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5772320628166199,
+      "rewards/curriculum_aware_reward_fn/std": 0.40502965450286865,
+      "step": 2819
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3216.0,
+      "completions/mean_length": 950.0714721679688,
+      "completions/mean_terminated_length": 892.8726806640625,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 2.9099819448026825,
+      "grad_norm": 0.46091020107269287,
+      "kl": 0.1280517578125,
+      "learning_rate": 1e-06,
+      "loss": -0.0054,
+      "num_tokens": 338281478.0,
+      "reward": 1.6665180921554565,
+      "reward_std": 0.20088189840316772,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6665177941322327,
+      "rewards/curriculum_aware_reward_fn/std": 0.4047793447971344,
+      "step": 2820
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3970.0,
+      "completions/mean_length": 981.2500610351562,
+      "completions/mean_terminated_length": 895.5228881835938,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 2.9110136703636833,
+      "grad_norm": 0.4261971116065979,
+      "kl": 0.109619140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0441,
+      "num_tokens": 338466243.0,
+      "reward": 1.5625001192092896,
+      "reward_std": 0.12906794250011444,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5625,
+      "rewards/curriculum_aware_reward_fn/std": 0.41058626770973206,
+      "step": 2821
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2439.0,
+      "completions/max_terminated_length": 2439.0,
+      "completions/mean_length": 724.9107666015625,
+      "completions/mean_terminated_length": 724.9107666015625,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 2.912045395924684,
+      "grad_norm": 0.5682430267333984,
+      "kl": 0.1474609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0154,
+      "num_tokens": 338605568.0,
+      "reward": 1.591071605682373,
+      "reward_std": 0.16823892295360565,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5910714268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.3809787333011627,
+      "step": 2822
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3659.0,
+      "completions/max_terminated_length": 3659.0,
+      "completions/mean_length": 943.0357666015625,
+      "completions/mean_terminated_length": 943.0357666015625,
+      "completions/min_length": 279.0,
+      "completions/min_terminated_length": 279.0,
+      "epoch": 2.913077121485685,
+      "grad_norm": 0.5692936182022095,
+      "kl": 0.123046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0234,
+      "num_tokens": 338785388.0,
+      "reward": 1.5294643640518188,
+      "reward_std": 0.2762223482131958,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5294643044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.41206759214401245,
+      "step": 2823
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3892.0,
+      "completions/mean_length": 930.8125610351562,
+      "completions/mean_terminated_length": 873.2636108398438,
+      "completions/min_length": 305.0,
+      "completions/min_terminated_length": 305.0,
+      "epoch": 2.914108847046686,
+      "grad_norm": 0.6045404076576233,
+      "kl": 0.130859375,
+      "learning_rate": 1e-06,
+      "loss": 0.066,
+      "num_tokens": 338955327.0,
+      "reward": 1.4459823369979858,
+      "reward_std": 0.21156080067157745,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4459821581840515,
+      "rewards/curriculum_aware_reward_fn/std": 0.3662654459476471,
+      "step": 2824
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3253.0,
+      "completions/mean_length": 999.607177734375,
+      "completions/mean_terminated_length": 914.38525390625,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 2.915140572607686,
+      "grad_norm": 0.3792083263397217,
+      "kl": 0.1165771484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0088,
+      "num_tokens": 339142323.0,
+      "reward": 1.6156251430511475,
+      "reward_std": 0.11980683356523514,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6156249642372131,
+      "rewards/curriculum_aware_reward_fn/std": 0.5581110119819641,
+      "step": 2825
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3649.0,
+      "completions/mean_length": 964.4107666015625,
+      "completions/mean_terminated_length": 878.2201538085938,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "epoch": 2.916172298168687,
+      "grad_norm": 0.5529889464378357,
+      "kl": 0.1221923828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0369,
+      "num_tokens": 339312434.0,
+      "reward": 1.3674108982086182,
+      "reward_std": 0.2013842761516571,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36741071939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.3771460950374603,
+      "step": 2826
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3098.0,
+      "completions/mean_length": 1062.6785888671875,
+      "completions/mean_terminated_length": 1035.351318359375,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 2.917204023729688,
+      "grad_norm": 0.5321182012557983,
+      "kl": 0.1229248046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0096,
+      "num_tokens": 339513739.0,
+      "reward": 1.4044643640518188,
+      "reward_std": 0.2600024342536926,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4044642746448517,
+      "rewards/curriculum_aware_reward_fn/std": 0.416038453578949,
+      "step": 2827
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3805.0,
+      "completions/mean_length": 987.0535888671875,
+      "completions/mean_terminated_length": 959.0450439453125,
+      "completions/min_length": 279.0,
+      "completions/min_terminated_length": 279.0,
+      "epoch": 2.9182357492906887,
+      "grad_norm": 0.5274437665939331,
+      "kl": 0.1236572265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0428,
+      "num_tokens": 339698146.0,
+      "reward": 1.4517858028411865,
+      "reward_std": 0.21504361927509308,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45178571343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.5527135729789734,
+      "step": 2828
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3789.0,
+      "completions/mean_length": 947.3125610351562,
+      "completions/mean_terminated_length": 860.6513671875,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "epoch": 2.9192674748516896,
+      "grad_norm": 0.5115098357200623,
+      "kl": 0.1151123046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0245,
+      "num_tokens": 339882113.0,
+      "reward": 1.4446429014205933,
+      "reward_std": 0.18274597823619843,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4446428716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.4277195334434509,
+      "step": 2829
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2271.0,
+      "completions/mean_length": 755.4107666015625,
+      "completions/mean_terminated_length": 725.3153076171875,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 2.92029920041269,
+      "grad_norm": 0.487190306186676,
+      "kl": 0.1287841796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0334,
+      "num_tokens": 340031308.0,
+      "reward": 1.6031250953674316,
+      "reward_std": 0.17092326283454895,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6031250357627869,
+      "rewards/curriculum_aware_reward_fn/std": 0.44262126088142395,
+      "step": 2830
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2653.0,
+      "completions/mean_length": 825.1160888671875,
+      "completions/mean_terminated_length": 795.648681640625,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 2.921330925973691,
+      "grad_norm": 0.6423907279968262,
+      "kl": 0.129638671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0496,
+      "num_tokens": 340195310.0,
+      "reward": 1.6241074800491333,
+      "reward_std": 0.18456201255321503,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6330357193946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.3969792127609253,
+      "step": 2831
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3153.0,
+      "completions/mean_length": 932.15185546875,
+      "completions/mean_terminated_length": 903.648681640625,
+      "completions/min_length": 257.0,
+      "completions/min_terminated_length": 257.0,
+      "epoch": 2.9223626515346917,
+      "grad_norm": 0.5027857422828674,
+      "kl": 0.126953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0407,
+      "num_tokens": 340367212.0,
+      "reward": 1.581696629524231,
+      "reward_std": 0.16816502809524536,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5816964507102966,
+      "rewards/curriculum_aware_reward_fn/std": 0.45227134227752686,
+      "step": 2832
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2578.0,
+      "completions/max_terminated_length": 2578.0,
+      "completions/mean_length": 864.6428833007812,
+      "completions/mean_terminated_length": 864.6428833007812,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 2.9233943770956925,
+      "grad_norm": 0.5807149410247803,
+      "kl": 0.13818359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0229,
+      "num_tokens": 340532652.0,
+      "reward": 1.5982143878936768,
+      "reward_std": 0.21378812193870544,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5982142686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.39931970834732056,
+      "step": 2833
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3418.0,
+      "completions/mean_length": 1074.1160888671875,
+      "completions/mean_terminated_length": 990.9448852539062,
+      "completions/min_length": 172.0,
+      "completions/min_terminated_length": 172.0,
+      "epoch": 2.9244261026566933,
+      "grad_norm": 0.5498697757720947,
+      "kl": 0.1300048828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0208,
+      "num_tokens": 340714651.0,
+      "reward": 1.7075893878936768,
+      "reward_std": 0.2345176488161087,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7165178656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3282075524330139,
+      "step": 2834
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3567.0,
+      "completions/mean_length": 1098.1875,
+      "completions/mean_terminated_length": 1043.6817626953125,
+      "completions/min_length": 147.0,
+      "completions/min_terminated_length": 147.0,
+      "epoch": 2.925457828217694,
+      "grad_norm": 0.5744073987007141,
+      "kl": 0.12158203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0426,
+      "num_tokens": 340906136.0,
+      "reward": 1.4392857551574707,
+      "reward_std": 0.18629738688468933,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4392856955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.37945127487182617,
+      "step": 2835
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2680.0,
+      "completions/max_terminated_length": 2680.0,
+      "completions/mean_length": 968.0982666015625,
+      "completions/mean_terminated_length": 968.0982666015625,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 2.926489553778695,
+      "grad_norm": 0.4878877103328705,
+      "kl": 0.124755859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0356,
+      "num_tokens": 341083996.0,
+      "reward": 1.4049108028411865,
+      "reward_std": 0.24925042688846588,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41383928060531616,
+      "rewards/curriculum_aware_reward_fn/std": 0.41844895482063293,
+      "step": 2836
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0357142857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3865.0,
+      "completions/mean_length": 1233.8035888671875,
+      "completions/mean_terminated_length": 1127.7962646484375,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 2.927521279339696,
+      "grad_norm": 0.4677523672580719,
+      "kl": 0.123046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0649,
+      "num_tokens": 341291300.0,
+      "reward": 1.5093750953674316,
+      "reward_std": 0.24394011497497559,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5093749761581421,
+      "rewards/curriculum_aware_reward_fn/std": 0.428202360868454,
+      "step": 2837
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3826.0,
+      "completions/mean_length": 1228.982177734375,
+      "completions/mean_terminated_length": 1203.1531982421875,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "epoch": 2.9285530049006963,
+      "grad_norm": 0.41032853722572327,
+      "kl": 0.107666015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0002,
+      "num_tokens": 341499247.0,
+      "reward": 1.471428632736206,
+      "reward_std": 0.20130853354930878,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47142860293388367,
+      "rewards/curriculum_aware_reward_fn/std": 0.42994415760040283,
+      "step": 2838
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3315.0,
+      "completions/max_terminated_length": 3315.0,
+      "completions/mean_length": 947.2232666015625,
+      "completions/mean_terminated_length": 947.2232666015625,
+      "completions/min_length": 155.0,
+      "completions/min_terminated_length": 155.0,
+      "epoch": 2.929584730461697,
+      "grad_norm": 0.5380738973617554,
+      "kl": 0.12353515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0548,
+      "num_tokens": 341665018.0,
+      "reward": 1.5549108982086182,
+      "reward_std": 0.22859680652618408,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.563839316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.4066568613052368,
+      "step": 2839
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3756.0,
+      "completions/mean_length": 1066.46435546875,
+      "completions/mean_terminated_length": 983.08251953125,
+      "completions/min_length": 293.0,
+      "completions/min_terminated_length": 293.0,
+      "epoch": 2.930616456022698,
+      "grad_norm": 0.4303039610385895,
+      "kl": 0.1212158203125,
+      "learning_rate": 1e-06,
+      "loss": -0.0139,
+      "num_tokens": 341859537.0,
+      "reward": 1.599107265472412,
+      "reward_std": 0.14463859796524048,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5991071462631226,
+      "rewards/curriculum_aware_reward_fn/std": 0.4139994978904724,
+      "step": 2840
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3358.0,
+      "completions/max_terminated_length": 3358.0,
+      "completions/mean_length": 786.9910888671875,
+      "completions/mean_terminated_length": 786.9910888671875,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 2.9316481815836988,
+      "grad_norm": 0.5593550205230713,
+      "kl": 0.13818359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0497,
+      "num_tokens": 342008076.0,
+      "reward": 1.612053632736206,
+      "reward_std": 0.1519756317138672,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6120535731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.4075440466403961,
+      "step": 2841
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3221.0,
+      "completions/mean_length": 1076.33935546875,
+      "completions/mean_terminated_length": 1021.4363403320312,
+      "completions/min_length": 289.0,
+      "completions/min_terminated_length": 289.0,
+      "epoch": 2.9326799071446996,
+      "grad_norm": 0.4824768006801605,
+      "kl": 0.129638671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0626,
+      "num_tokens": 342193264.0,
+      "reward": 1.5526785850524902,
+      "reward_std": 0.25366339087486267,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5705357193946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.4081684947013855,
+      "step": 2842
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3469.0,
+      "completions/mean_length": 1164.2679443359375,
+      "completions/mean_terminated_length": 1110.963623046875,
+      "completions/min_length": 253.0,
+      "completions/min_terminated_length": 253.0,
+      "epoch": 2.9337116327057,
+      "grad_norm": 0.5000720024108887,
+      "kl": 0.1190185546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0064,
+      "num_tokens": 342397911.0,
+      "reward": 1.372321605682373,
+      "reward_std": 0.24542978405952454,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3812499940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.41509386897087097,
+      "step": 2843
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3398.0,
+      "completions/mean_length": 802.9285888671875,
+      "completions/mean_terminated_length": 773.2612915039062,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 2.934743358266701,
+      "grad_norm": 0.4824245572090149,
+      "kl": 0.140869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0265,
+      "num_tokens": 342545232.0,
+      "reward": 1.7607142925262451,
+      "reward_std": 0.14560459554195404,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7607142329216003,
+      "rewards/curriculum_aware_reward_fn/std": 0.37712928652763367,
+      "step": 2844
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3877.0,
+      "completions/mean_length": 1002.3660888671875,
+      "completions/mean_terminated_length": 946.1181640625,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 2.9357750838277017,
+      "grad_norm": 0.4928884208202362,
+      "kl": 0.124755859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0697,
+      "num_tokens": 342713812.0,
+      "reward": 1.5191963911056519,
+      "reward_std": 0.16029249131679535,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5191964507102966,
+      "rewards/curriculum_aware_reward_fn/std": 0.44485270977020264,
+      "step": 2845
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2634.0,
+      "completions/mean_length": 991.1250610351562,
+      "completions/mean_terminated_length": 905.669677734375,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 2.9368068093887025,
+      "grad_norm": 0.5415922999382019,
+      "kl": 0.121337890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0152,
+      "num_tokens": 342893194.0,
+      "reward": 1.5223214626312256,
+      "reward_std": 0.1560944765806198,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5223214030265808,
+      "rewards/curriculum_aware_reward_fn/std": 0.3887980580329895,
+      "step": 2846
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3737.0,
+      "completions/mean_length": 981.9910888671875,
+      "completions/mean_terminated_length": 953.9369506835938,
+      "completions/min_length": 269.0,
+      "completions/min_terminated_length": 269.0,
+      "epoch": 2.9378385349497034,
+      "grad_norm": 0.5075610280036926,
+      "kl": 0.116943359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0055,
+      "num_tokens": 343080191.0,
+      "reward": 1.4678572416305542,
+      "reward_std": 0.307505339384079,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46785715222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.4610435664653778,
+      "step": 2847
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4084.0,
+      "completions/mean_length": 972.3035888671875,
+      "completions/mean_terminated_length": 944.1621704101562,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 2.938870260510704,
+      "grad_norm": 0.4480708837509155,
+      "kl": 0.1298828125,
+      "learning_rate": 1e-06,
+      "loss": 0.029,
+      "num_tokens": 343258900.0,
+      "reward": 1.5433037281036377,
+      "reward_std": 0.1520136296749115,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.561160683631897,
+      "rewards/curriculum_aware_reward_fn/std": 0.36383283138275146,
+      "step": 2848
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2760.0,
+      "completions/max_terminated_length": 2760.0,
+      "completions/mean_length": 842.857177734375,
+      "completions/mean_terminated_length": 842.857177734375,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 2.939901986071705,
+      "grad_norm": 0.5809365510940552,
+      "kl": 0.143798828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0887,
+      "num_tokens": 343417287.0,
+      "reward": 1.5294644832611084,
+      "reward_std": 0.19331319630146027,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5294643044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.42243146896362305,
+      "step": 2849
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2616.0,
+      "completions/max_terminated_length": 2616.0,
+      "completions/mean_length": 855.7767944335938,
+      "completions/mean_terminated_length": 855.7767944335938,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 2.940933711632706,
+      "grad_norm": 0.5113884806632996,
+      "kl": 0.12109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0412,
+      "num_tokens": 343576869.0,
+      "reward": 1.41785728931427,
+      "reward_std": 0.19872523844242096,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4357142746448517,
+      "rewards/curriculum_aware_reward_fn/std": 0.44094642996788025,
+      "step": 2850
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2965.0,
+      "completions/max_terminated_length": 2965.0,
+      "completions/mean_length": 716.7500610351562,
+      "completions/mean_terminated_length": 716.7500610351562,
+      "completions/min_length": 143.0,
+      "completions/min_terminated_length": 143.0,
+      "epoch": 2.9419654371937067,
+      "grad_norm": 0.5477640628814697,
+      "kl": 0.140869140625,
+      "learning_rate": 1e-06,
+      "loss": -0.017,
+      "num_tokens": 343720810.0,
+      "reward": 1.7714285850524902,
+      "reward_std": 0.2532138526439667,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7714285254478455,
+      "rewards/curriculum_aware_reward_fn/std": 0.49190354347229004,
+      "step": 2851
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3282.0,
+      "completions/mean_length": 754.9553833007812,
+      "completions/mean_terminated_length": 724.8558959960938,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 2.942997162754707,
+      "grad_norm": 0.5419442057609558,
+      "kl": 0.13427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0707,
+      "num_tokens": 343859496.0,
+      "reward": 1.6598217487335205,
+      "reward_std": 0.18061259388923645,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.668749988079071,
+      "rewards/curriculum_aware_reward_fn/std": 0.5330147743225098,
+      "step": 2852
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2810.0,
+      "completions/max_terminated_length": 2810.0,
+      "completions/mean_length": 825.6339721679688,
+      "completions/mean_terminated_length": 825.6339721679688,
+      "completions/min_length": 150.0,
+      "completions/min_terminated_length": 150.0,
+      "epoch": 2.944028888315708,
+      "grad_norm": 0.5500017404556274,
+      "kl": 0.1317138671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0183,
+      "num_tokens": 344020070.0,
+      "reward": 1.6044644117355347,
+      "reward_std": 0.27769601345062256,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6133928298950195,
+      "rewards/curriculum_aware_reward_fn/std": 0.41924333572387695,
+      "step": 2853
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3986.0,
+      "completions/mean_length": 1099.1429443359375,
+      "completions/mean_terminated_length": 1016.6605224609375,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 2.945060613876709,
+      "grad_norm": 0.46461525559425354,
+      "kl": 0.1148681640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0834,
+      "num_tokens": 344206762.0,
+      "reward": 1.4647324085235596,
+      "reward_std": 0.21833495795726776,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4736606776714325,
+      "rewards/curriculum_aware_reward_fn/std": 0.42542821168899536,
+      "step": 2854
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3170.0,
+      "completions/mean_length": 833.1785888671875,
+      "completions/mean_terminated_length": 803.7838134765625,
+      "completions/min_length": 71.0,
+      "completions/min_terminated_length": 71.0,
+      "epoch": 2.9460923394377097,
+      "grad_norm": 0.4217899739742279,
+      "kl": 0.126708984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0102,
+      "num_tokens": 344364660.0,
+      "reward": 1.5799108743667603,
+      "reward_std": 0.15548360347747803,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5799107551574707,
+      "rewards/curriculum_aware_reward_fn/std": 0.4215191602706909,
+      "step": 2855
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3976.0,
+      "completions/mean_length": 958.90185546875,
+      "completions/mean_terminated_length": 901.8635864257812,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "epoch": 2.9471240649987105,
+      "grad_norm": 0.4388851821422577,
+      "kl": 0.1221923828125,
+      "learning_rate": 1e-06,
+      "loss": -0.0618,
+      "num_tokens": 344538795.0,
+      "reward": 1.4928573369979858,
+      "reward_std": 0.12822991609573364,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5017856955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.4532131850719452,
+      "step": 2856
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2801.0,
+      "completions/mean_length": 834.2053833007812,
+      "completions/mean_terminated_length": 774.8999633789062,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 2.948155790559711,
+      "grad_norm": 0.5034121870994568,
+      "kl": 0.1298828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0575,
+      "num_tokens": 344688492.0,
+      "reward": 1.7308037281036377,
+      "reward_std": 0.2394391894340515,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7397321462631226,
+      "rewards/curriculum_aware_reward_fn/std": 0.3914133608341217,
+      "step": 2857
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3984.0,
+      "completions/mean_length": 1040.169677734375,
+      "completions/mean_terminated_length": 984.6090698242188,
+      "completions/min_length": 280.0,
+      "completions/min_terminated_length": 280.0,
+      "epoch": 2.9491875161207117,
+      "grad_norm": 0.42861661314964294,
+      "kl": 0.1171875,
+      "learning_rate": 1e-06,
+      "loss": 0.004,
+      "num_tokens": 344872023.0,
+      "reward": 1.3441965579986572,
+      "reward_std": 0.1831224113702774,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3441964089870453,
+      "rewards/curriculum_aware_reward_fn/std": 0.43468666076660156,
+      "step": 2858
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3575.0,
+      "completions/max_terminated_length": 3575.0,
+      "completions/mean_length": 857.857177734375,
+      "completions/mean_terminated_length": 857.857177734375,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 2.9502192416817126,
+      "grad_norm": 0.4969906806945801,
+      "kl": 0.1148681640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0681,
+      "num_tokens": 345027057.0,
+      "reward": 1.5781251192092896,
+      "reward_std": 0.16588565707206726,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5870535969734192,
+      "rewards/curriculum_aware_reward_fn/std": 0.40374070405960083,
+      "step": 2859
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3894.0,
+      "completions/mean_length": 780.8392944335938,
+      "completions/mean_terminated_length": 750.9729614257812,
+      "completions/min_length": 154.0,
+      "completions/min_terminated_length": 154.0,
+      "epoch": 2.9512509672427134,
+      "grad_norm": 0.4635484516620636,
+      "kl": 0.12744140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0418,
+      "num_tokens": 345183251.0,
+      "reward": 1.5647321939468384,
+      "reward_std": 0.07009764015674591,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5647321343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.46443989872932434,
+      "step": 2860
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1736.0,
+      "completions/mean_length": 652.5982666015625,
+      "completions/mean_terminated_length": 589.9909057617188,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 2.9522826928037142,
+      "grad_norm": 0.6120730638504028,
+      "kl": 0.1446533203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0208,
+      "num_tokens": 345320483.0,
+      "reward": 1.5982145071029663,
+      "reward_std": 0.1766142100095749,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6160714030265808,
+      "rewards/curriculum_aware_reward_fn/std": 0.43827131390571594,
+      "step": 2861
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3693.0,
+      "completions/mean_length": 883.9910888671875,
+      "completions/mean_terminated_length": 825.5908813476562,
+      "completions/min_length": 140.0,
+      "completions/min_terminated_length": 140.0,
+      "epoch": 2.953314418364715,
+      "grad_norm": 0.5264288783073425,
+      "kl": 0.107177734375,
+      "learning_rate": 1e-06,
+      "loss": -0.0233,
+      "num_tokens": 345486739.0,
+      "reward": 1.5410715341567993,
+      "reward_std": 0.19420649111270905,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5410714149475098,
+      "rewards/curriculum_aware_reward_fn/std": 0.45198333263397217,
+      "step": 2862
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3149.0,
+      "completions/max_terminated_length": 3149.0,
+      "completions/mean_length": 692.7142944335938,
+      "completions/mean_terminated_length": 692.7142944335938,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 2.954346143925716,
+      "grad_norm": 0.6059958338737488,
+      "kl": 0.13427734375,
+      "learning_rate": 1e-06,
+      "loss": -0.007,
+      "num_tokens": 345622780.0,
+      "reward": 1.7245537042617798,
+      "reward_std": 0.24420930445194244,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7334821820259094,
+      "rewards/curriculum_aware_reward_fn/std": 0.381579726934433,
+      "step": 2863
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3316.0,
+      "completions/mean_length": 1037.625,
+      "completions/mean_terminated_length": 1010.0720825195312,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "epoch": 2.9553778694867168,
+      "grad_norm": 0.5002031922340393,
+      "kl": 0.1168212890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0154,
+      "num_tokens": 345811500.0,
+      "reward": 1.4736608266830444,
+      "reward_std": 0.22914379835128784,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47366073727607727,
+      "rewards/curriculum_aware_reward_fn/std": 0.39881354570388794,
+      "step": 2864
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1879.0,
+      "completions/max_terminated_length": 1879.0,
+      "completions/mean_length": 682.794677734375,
+      "completions/mean_terminated_length": 682.794677734375,
+      "completions/min_length": 133.0,
+      "completions/min_terminated_length": 133.0,
+      "epoch": 2.956409595047717,
+      "grad_norm": 0.5402216911315918,
+      "kl": 0.12255859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0035,
+      "num_tokens": 345950497.0,
+      "reward": 1.5294643640518188,
+      "reward_std": 0.1578807830810547,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5383928418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.4456993043422699,
+      "step": 2865
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3317.0,
+      "completions/max_terminated_length": 3317.0,
+      "completions/mean_length": 645.2589721679688,
+      "completions/mean_terminated_length": 645.2589721679688,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 2.957441320608718,
+      "grad_norm": 0.7363216280937195,
+      "kl": 0.1314697265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0286,
+      "num_tokens": 346073054.0,
+      "reward": 1.5375001430511475,
+      "reward_std": 0.22698816657066345,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5375000238418579,
+      "rewards/curriculum_aware_reward_fn/std": 0.3998591899871826,
+      "step": 2866
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3290.0,
+      "completions/mean_length": 780.4464721679688,
+      "completions/mean_terminated_length": 750.5765991210938,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 2.958473046169719,
+      "grad_norm": 0.5626248717308044,
+      "kl": 0.122314453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0016,
+      "num_tokens": 346223968.0,
+      "reward": 1.4299107789993286,
+      "reward_std": 0.22761982679367065,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42991071939468384,
+      "rewards/curriculum_aware_reward_fn/std": 0.4294591248035431,
+      "step": 2867
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3555.0,
+      "completions/max_terminated_length": 3555.0,
+      "completions/mean_length": 774.9285888671875,
+      "completions/mean_terminated_length": 774.9285888671875,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 2.9595047717307197,
+      "grad_norm": 0.633262038230896,
+      "kl": 0.13623046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0027,
+      "num_tokens": 346375412.0,
+      "reward": 1.4750001430511475,
+      "reward_std": 0.26338452100753784,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48392853140830994,
+      "rewards/curriculum_aware_reward_fn/std": 0.3957666754722595,
+      "step": 2868
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3717.0,
+      "completions/max_terminated_length": 3717.0,
+      "completions/mean_length": 777.6428833007812,
+      "completions/mean_terminated_length": 777.6428833007812,
+      "completions/min_length": 279.0,
+      "completions/min_terminated_length": 279.0,
+      "epoch": 2.9605364972917205,
+      "grad_norm": 0.5764303207397461,
+      "kl": 0.12646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0124,
+      "num_tokens": 346521896.0,
+      "reward": 1.5575894117355347,
+      "reward_std": 0.19927051663398743,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5575893521308899,
+      "rewards/curriculum_aware_reward_fn/std": 0.38783350586891174,
+      "step": 2869
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2798.0,
+      "completions/mean_length": 924.8035888671875,
+      "completions/mean_terminated_length": 896.2342529296875,
+      "completions/min_length": 288.0,
+      "completions/min_terminated_length": 288.0,
+      "epoch": 2.961568222852721,
+      "grad_norm": 0.5695756077766418,
+      "kl": 0.1204833984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0135,
+      "num_tokens": 346698218.0,
+      "reward": 1.4745537042617798,
+      "reward_std": 0.205791175365448,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47455358505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.4199954569339752,
+      "step": 2870
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3622.0,
+      "completions/max_terminated_length": 3622.0,
+      "completions/mean_length": 910.5625610351562,
+      "completions/mean_terminated_length": 910.5625610351562,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 2.9625999484137218,
+      "grad_norm": 0.5911626815795898,
+      "kl": 0.130615234375,
+      "learning_rate": 1e-06,
+      "loss": 0.035,
+      "num_tokens": 346872168.0,
+      "reward": 1.4625000953674316,
+      "reward_std": 0.24348336458206177,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47142860293388367,
+      "rewards/curriculum_aware_reward_fn/std": 0.36028730869293213,
+      "step": 2871
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3325.0,
+      "completions/mean_length": 850.607177734375,
+      "completions/mean_terminated_length": 821.369384765625,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 2.9636316739747226,
+      "grad_norm": 0.5764513611793518,
+      "kl": 0.12646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0217,
+      "num_tokens": 347033201.0,
+      "reward": 1.5437501668930054,
+      "reward_std": 0.26699310541152954,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5616071820259094,
+      "rewards/curriculum_aware_reward_fn/std": 0.44716235995292664,
+      "step": 2872
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3913.0,
+      "completions/mean_length": 1122.884033203125,
+      "completions/mean_terminated_length": 1041.0550537109375,
+      "completions/min_length": 320.0,
+      "completions/min_terminated_length": 320.0,
+      "epoch": 2.9646633995357234,
+      "grad_norm": 0.6206802129745483,
+      "kl": 0.130615234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0034,
+      "num_tokens": 347237056.0,
+      "reward": 1.6232144832611084,
+      "reward_std": 0.21191005408763885,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6321429014205933,
+      "rewards/curriculum_aware_reward_fn/std": 0.5180028080940247,
+      "step": 2873
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 4092.0,
+      "completions/max_terminated_length": 4092.0,
+      "completions/mean_length": 929.3303833007812,
+      "completions/mean_terminated_length": 929.3303833007812,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 2.9656951250967243,
+      "grad_norm": 0.500167965888977,
+      "kl": 0.1180419921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0013,
+      "num_tokens": 347408621.0,
+      "reward": 1.5196430683135986,
+      "reward_std": 0.18252387642860413,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5196428894996643,
+      "rewards/curriculum_aware_reward_fn/std": 0.4163525402545929,
+      "step": 2874
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3737.0,
+      "completions/mean_length": 942.8214721679688,
+      "completions/mean_terminated_length": 885.4909057617188,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 2.966726850657725,
+      "grad_norm": 0.5842325687408447,
+      "kl": 0.1158447265625,
+      "learning_rate": 1e-06,
+      "loss": -0.0194,
+      "num_tokens": 347585309.0,
+      "reward": 1.4964287281036377,
+      "reward_std": 0.24344106018543243,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.49642854928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.3867579400539398,
+      "step": 2875
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3725.0,
+      "completions/mean_length": 990.0714721679688,
+      "completions/mean_terminated_length": 904.5870971679688,
+      "completions/min_length": 273.0,
+      "completions/min_terminated_length": 273.0,
+      "epoch": 2.967758576218726,
+      "grad_norm": 0.4529733657836914,
+      "kl": 0.1328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0215,
+      "num_tokens": 347772077.0,
+      "reward": 1.6580358743667603,
+      "reward_std": 0.14680266380310059,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6580356955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.530958354473114,
+      "step": 2876
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2847.0,
+      "completions/mean_length": 947.6964721679688,
+      "completions/mean_terminated_length": 919.3333740234375,
+      "completions/min_length": 135.0,
+      "completions/min_terminated_length": 135.0,
+      "epoch": 2.968790301779727,
+      "grad_norm": 0.5367709994316101,
+      "kl": 0.12158203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0267,
+      "num_tokens": 347946119.0,
+      "reward": 1.5232144594192505,
+      "reward_std": 0.20710141956806183,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5321428179740906,
+      "rewards/curriculum_aware_reward_fn/std": 0.36937108635902405,
+      "step": 2877
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2400.0,
+      "completions/mean_length": 1164.02685546875,
+      "completions/mean_terminated_length": 968.5619506835938,
+      "completions/min_length": 295.0,
+      "completions/min_terminated_length": 295.0,
+      "epoch": 2.9698220273407276,
+      "grad_norm": 0.5432129502296448,
+      "kl": 0.1143798828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0151,
+      "num_tokens": 348145083.0,
+      "reward": 1.3718750476837158,
+      "reward_std": 0.2610403299331665,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38973215222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.35635408759117126,
+      "step": 2878
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3759.0,
+      "completions/mean_length": 971.1428833007812,
+      "completions/mean_terminated_length": 914.3272705078125,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 2.970853752901728,
+      "grad_norm": 0.517999529838562,
+      "kl": 0.1328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0221,
+      "num_tokens": 348315248.0,
+      "reward": 1.4370535612106323,
+      "reward_std": 0.17417265474796295,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.43705353140830994,
+      "rewards/curriculum_aware_reward_fn/std": 0.3891979157924652,
+      "step": 2879
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4024.0,
+      "completions/mean_length": 895.1964721679688,
+      "completions/mean_terminated_length": 837.0,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 2.971885478462729,
+      "grad_norm": 0.5721215605735779,
+      "kl": 0.13330078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0032,
+      "num_tokens": 348487785.0,
+      "reward": 1.5169644355773926,
+      "reward_std": 0.23866644501686096,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.516964316368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.4103148579597473,
+      "step": 2880
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3408.0,
+      "completions/mean_length": 931.08935546875,
+      "completions/mean_terminated_length": 873.54541015625,
+      "completions/min_length": 239.0,
+      "completions/min_terminated_length": 239.0,
+      "epoch": 2.9729172040237297,
+      "grad_norm": 0.5178924798965454,
+      "kl": 0.12646484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0215,
+      "num_tokens": 348654164.0,
+      "reward": 1.5508930683135986,
+      "reward_std": 0.23712819814682007,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.1330273300409317,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5687499642372131,
+      "rewards/curriculum_aware_reward_fn/std": 0.4166644215583801,
+      "step": 2881
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3908.0,
+      "completions/mean_length": 1176.634033203125,
+      "completions/mean_terminated_length": 982.0095825195312,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 2.9739489295847306,
+      "grad_norm": 0.42219457030296326,
+      "kl": 0.10546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0427,
+      "num_tokens": 348850830.0,
+      "reward": 1.4392858743667603,
+      "reward_std": 0.23737432062625885,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4482142925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.47081151604652405,
+      "step": 2882
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4067.0,
+      "completions/mean_length": 1074.0804443359375,
+      "completions/mean_terminated_length": 990.908203125,
+      "completions/min_length": 190.0,
+      "completions/min_terminated_length": 190.0,
+      "epoch": 2.9749806551457314,
+      "grad_norm": 0.48693951964378357,
+      "kl": 0.135498046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0446,
+      "num_tokens": 349037571.0,
+      "reward": 1.5375001430511475,
+      "reward_std": 0.2253728061914444,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5464285612106323,
+      "rewards/curriculum_aware_reward_fn/std": 0.39925122261047363,
+      "step": 2883
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.017857142857142905,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3462.0,
+      "completions/mean_length": 918.5178833007812,
+      "completions/mean_terminated_length": 860.7454223632812,
+      "completions/min_length": 278.0,
+      "completions/min_terminated_length": 278.0,
+      "epoch": 2.976012380706732,
+      "grad_norm": 0.6096733212471008,
+      "kl": 0.13134765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0096,
+      "num_tokens": 349209437.0,
+      "reward": 1.555803656578064,
+      "reward_std": 0.21068914234638214,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5558035969734192,
+      "rewards/curriculum_aware_reward_fn/std": 0.4252580404281616,
+      "step": 2884
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.044642857142857095,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3727.0,
+      "completions/mean_length": 1129.982177734375,
+      "completions/mean_terminated_length": 991.3831176757812,
+      "completions/min_length": 100.0,
+      "completions/min_terminated_length": 100.0,
+      "epoch": 2.9770441062677326,
+      "grad_norm": 0.5170513391494751,
+      "kl": 0.124755859375,
+      "learning_rate": 1e-06,
+      "loss": -0.0205,
+      "num_tokens": 349406653.0,
+      "reward": 1.4973214864730835,
+      "reward_std": 0.2478838711977005,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5062500238418579,
+      "rewards/curriculum_aware_reward_fn/std": 0.4440357983112335,
+      "step": 2885
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0357142857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4037.0,
+      "completions/mean_length": 1047.7679443359375,
+      "completions/mean_terminated_length": 934.870361328125,
+      "completions/min_length": 273.0,
+      "completions/min_terminated_length": 273.0,
+      "epoch": 2.9780758318287335,
+      "grad_norm": 0.6094416379928589,
+      "kl": 0.1243896484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0052,
+      "num_tokens": 349590188.0,
+      "reward": 1.4638394117355347,
+      "reward_std": 0.22185523808002472,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4638392925262451,
+      "rewards/curriculum_aware_reward_fn/std": 0.41471850872039795,
+      "step": 2886
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.044642857142857095,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3679.0,
+      "completions/mean_length": 1009.02685546875,
+      "completions/mean_terminated_length": 864.7756958007812,
+      "completions/min_length": 218.0,
+      "completions/min_terminated_length": 218.0,
+      "epoch": 2.9791075573897343,
+      "grad_norm": 0.4644887447357178,
+      "kl": 0.1251220703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0105,
+      "num_tokens": 349766864.0,
+      "reward": 1.606696605682373,
+      "reward_std": 0.18360301852226257,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6066964268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.45329102873802185,
+      "step": 2887
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4073.0,
+      "completions/mean_length": 974.77685546875,
+      "completions/mean_terminated_length": 888.8715209960938,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 2.980139282950735,
+      "grad_norm": 0.4979686439037323,
+      "kl": 0.1282958984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0453,
+      "num_tokens": 349941783.0,
+      "reward": 1.6200894117355347,
+      "reward_std": 0.1684245765209198,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6200892329216003,
+      "rewards/curriculum_aware_reward_fn/std": 0.4342571496963501,
+      "step": 2888
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3619.0,
+      "completions/mean_length": 1229.482177734375,
+      "completions/mean_terminated_length": 1150.587158203125,
+      "completions/min_length": 301.0,
+      "completions/min_terminated_length": 301.0,
+      "epoch": 2.981171008511736,
+      "grad_norm": 0.47304999828338623,
+      "kl": 0.11767578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0021,
+      "num_tokens": 350158060.0,
+      "reward": 1.4955357313156128,
+      "reward_std": 0.2193337231874466,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4955357611179352,
+      "rewards/curriculum_aware_reward_fn/std": 0.3761201798915863,
+      "step": 2889
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3940.0,
+      "completions/mean_length": 998.1339721679688,
+      "completions/mean_terminated_length": 912.8715209960938,
+      "completions/min_length": 104.0,
+      "completions/min_terminated_length": 104.0,
+      "epoch": 2.982202734072737,
+      "grad_norm": 0.5146641135215759,
+      "kl": 0.12060546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0306,
+      "num_tokens": 350327566.0,
+      "reward": 1.5937501192092896,
+      "reward_std": 0.216999813914299,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.59375,
+      "rewards/curriculum_aware_reward_fn/std": 0.39135506749153137,
+      "step": 2890
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0535714285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3877.0,
+      "completions/mean_length": 1154.3035888671875,
+      "completions/mean_terminated_length": 987.79248046875,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 2.9832344596337377,
+      "grad_norm": 0.45033833384513855,
+      "kl": 0.117919921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0186,
+      "num_tokens": 350530831.0,
+      "reward": 1.4026787281036377,
+      "reward_std": 0.1656990796327591,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41160717606544495,
+      "rewards/curriculum_aware_reward_fn/std": 0.42838382720947266,
+      "step": 2891
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3993.0,
+      "completions/mean_length": 1066.4285888671875,
+      "completions/mean_terminated_length": 864.4571533203125,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 2.984266185194738,
+      "grad_norm": 0.5000366568565369,
+      "kl": 0.1221923828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0713,
+      "num_tokens": 350718878.0,
+      "reward": 1.591071605682373,
+      "reward_std": 0.22192177176475525,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5999999642372131,
+      "rewards/curriculum_aware_reward_fn/std": 0.3897908926010132,
+      "step": 2892
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3983.0,
+      "completions/mean_length": 1087.821533203125,
+      "completions/mean_terminated_length": 1005.0274658203125,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "epoch": 2.985297910755739,
+      "grad_norm": 0.4949439465999603,
+      "kl": 0.1258544921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0599,
+      "num_tokens": 350908691.0,
+      "reward": 1.5763394832611084,
+      "reward_std": 0.24737325310707092,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5763393044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.4229327440261841,
+      "step": 2893
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0357142857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4035.0,
+      "completions/mean_length": 1050.2857666015625,
+      "completions/mean_terminated_length": 937.4815063476562,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 2.9863296363167398,
+      "grad_norm": 0.3857535123825073,
+      "kl": 0.11279296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0305,
+      "num_tokens": 351087601.0,
+      "reward": 1.4616073369979858,
+      "reward_std": 0.1593252718448639,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4705357253551483,
+      "rewards/curriculum_aware_reward_fn/std": 0.458677738904953,
+      "step": 2894
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3312.0,
+      "completions/mean_length": 916.4107666015625,
+      "completions/mean_terminated_length": 887.7658081054688,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 2.9873613618777406,
+      "grad_norm": 0.5426157712936401,
+      "kl": 0.13037109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0921,
+      "num_tokens": 351259917.0,
+      "reward": 1.533482313156128,
+      "reward_std": 0.2218896448612213,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5424107313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.4264989197254181,
+      "step": 2895
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3318.0,
+      "completions/mean_length": 940.4553833007812,
+      "completions/mean_terminated_length": 853.60546875,
+      "completions/min_length": 269.0,
+      "completions/min_terminated_length": 269.0,
+      "epoch": 2.9883930874387414,
+      "grad_norm": 0.5972111225128174,
+      "kl": 0.1295166015625,
+      "learning_rate": 1e-06,
+      "loss": -0.0023,
+      "num_tokens": 351431013.0,
+      "reward": 1.4687501192092896,
+      "reward_std": 0.19047865271568298,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46875,
+      "rewards/curriculum_aware_reward_fn/std": 0.4237395226955414,
+      "step": 2896
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0535714285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3235.0,
+      "completions/mean_length": 1138.3660888671875,
+      "completions/mean_terminated_length": 970.9528198242188,
+      "completions/min_length": 264.0,
+      "completions/min_terminated_length": 264.0,
+      "epoch": 2.989424812999742,
+      "grad_norm": 0.5497664213180542,
+      "kl": 0.1187744140625,
+      "learning_rate": 1e-06,
+      "loss": -0.047,
+      "num_tokens": 351632073.0,
+      "reward": 1.4107143878936768,
+      "reward_std": 0.24470612406730652,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4107142984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.35134419798851013,
+      "step": 2897
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3804.0,
+      "completions/max_terminated_length": 3804.0,
+      "completions/mean_length": 744.1428833007812,
+      "completions/mean_terminated_length": 744.1428833007812,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 2.9904565385607427,
+      "grad_norm": 0.6259168386459351,
+      "kl": 0.130126953125,
+      "learning_rate": 1e-06,
+      "loss": 0.037,
+      "num_tokens": 351781336.0,
+      "reward": 1.4834822416305542,
+      "reward_std": 0.25032472610473633,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.48348215222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.41260480880737305,
+      "step": 2898
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2275.0,
+      "completions/max_terminated_length": 2275.0,
+      "completions/mean_length": 776.482177734375,
+      "completions/mean_terminated_length": 776.482177734375,
+      "completions/min_length": 169.0,
+      "completions/min_terminated_length": 169.0,
+      "epoch": 2.9914882641217435,
+      "grad_norm": 0.4901755750179291,
+      "kl": 0.1298828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0476,
+      "num_tokens": 351937238.0,
+      "reward": 1.4642857313156128,
+      "reward_std": 0.1534547507762909,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4732142388820648,
+      "rewards/curriculum_aware_reward_fn/std": 0.44615110754966736,
+      "step": 2899
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2774.0,
+      "completions/max_terminated_length": 2774.0,
+      "completions/mean_length": 758.8660888671875,
+      "completions/mean_terminated_length": 758.8660888671875,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 2.9925199896827444,
+      "grad_norm": 0.6221413612365723,
+      "kl": 0.1455078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0127,
+      "num_tokens": 352083523.0,
+      "reward": 1.4660714864730835,
+      "reward_std": 0.22209754586219788,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.46607139706611633,
+      "rewards/curriculum_aware_reward_fn/std": 0.43904146552085876,
+      "step": 2900
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2544.0,
+      "completions/mean_length": 846.5357666015625,
+      "completions/mean_terminated_length": 817.2612915039062,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 2.993551715243745,
+      "grad_norm": 0.5860307216644287,
+      "kl": 0.1160888671875,
+      "learning_rate": 1e-06,
+      "loss": -0.0611,
+      "num_tokens": 352247390.0,
+      "reward": 1.5151787996292114,
+      "reward_std": 0.20008236169815063,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5151785612106323,
+      "rewards/curriculum_aware_reward_fn/std": 0.416594922542572,
+      "step": 2901
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0267857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3071.0,
+      "completions/mean_length": 966.8035888671875,
+      "completions/mean_terminated_length": 880.6788330078125,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 2.994583440804746,
+      "grad_norm": 0.5898165702819824,
+      "kl": 0.137451171875,
+      "learning_rate": 1e-06,
+      "loss": -0.0341,
+      "num_tokens": 352413310.0,
+      "reward": 1.6013394594192505,
+      "reward_std": 0.2658516764640808,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6013393402099609,
+      "rewards/curriculum_aware_reward_fn/std": 0.40959545969963074,
+      "step": 2902
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3570.0,
+      "completions/mean_length": 728.4553833007812,
+      "completions/mean_terminated_length": 698.1171264648438,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 2.995615166365747,
+      "grad_norm": 0.6526123881340027,
+      "kl": 0.130859375,
+      "learning_rate": 1e-06,
+      "loss": 0.04,
+      "num_tokens": 352562685.0,
+      "reward": 1.6022323369979858,
+      "reward_std": 0.20173947513103485,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6022320985794067,
+      "rewards/curriculum_aware_reward_fn/std": 0.40810418128967285,
+      "step": 2903
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 4024.0,
+      "completions/max_terminated_length": 4024.0,
+      "completions/mean_length": 784.669677734375,
+      "completions/mean_terminated_length": 784.669677734375,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 2.9966468919267477,
+      "grad_norm": 0.582514762878418,
+      "kl": 0.1290283203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0092,
+      "num_tokens": 352714296.0,
+      "reward": 1.5281251668930054,
+      "reward_std": 0.17819184064865112,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.528124988079071,
+      "rewards/curriculum_aware_reward_fn/std": 0.39385947585105896,
+      "step": 2904
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2639.0,
+      "completions/mean_length": 700.7857666015625,
+      "completions/mean_terminated_length": 670.1982421875,
+      "completions/min_length": 245.0,
+      "completions/min_terminated_length": 245.0,
+      "epoch": 2.997678617487748,
+      "grad_norm": 0.6107363700866699,
+      "kl": 0.1234130859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0158,
+      "num_tokens": 352856750.0,
+      "reward": 1.588392972946167,
+      "reward_std": 0.2505446672439575,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09449111670255661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5973213911056519,
+      "rewards/curriculum_aware_reward_fn/std": 0.4182136654853821,
+      "step": 2905
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.008928571428571397,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3624.0,
+      "completions/mean_length": 772.169677734375,
+      "completions/mean_terminated_length": 742.2252197265625,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 2.998710343048749,
+      "grad_norm": 0.6734402179718018,
+      "kl": 0.1328125,
+      "learning_rate": 1e-06,
+      "loss": -0.0523,
+      "num_tokens": 353016229.0,
+      "reward": 1.3513394594192505,
+      "reward_std": 0.22044141590595245,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35133928060531616,
+      "rewards/curriculum_aware_reward_fn/std": 0.4162498712539673,
+      "step": 2906
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2167.0,
+      "completions/max_terminated_length": 2167.0,
+      "completions/mean_length": 748.3200073242188,
+      "completions/mean_terminated_length": 748.3200073242188,
+      "completions/min_length": 275.0,
+      "completions/min_terminated_length": 275.0,
+      "epoch": 2.99974206860975,
+      "grad_norm": 0.6013002991676331,
+      "kl": 0.1295166015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0093,
+      "num_tokens": 353173097.0,
+      "reward": 1.3821431398391724,
+      "reward_std": 0.18830367922782898,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3821428716182709,
+      "rewards/curriculum_aware_reward_fn/std": 0.3469855487346649,
+      "step": 2907
+    },
+    {
+      "epoch": 2.99974206860975,
+      "step": 2907,
+      "total_flos": 0.0,
+      "train_loss": 0.015996046145483354,
+      "train_runtime": 176359.9042,
+      "train_samples_per_second": 0.264,
+      "train_steps_per_second": 0.016
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 2907,
+  "num_input_tokens_seen": 353173097,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}