diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,78532 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.99974206860975, + "eval_steps": 500, + "global_step": 2907, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1169.0, + "completions/max_terminated_length": 1169.0, + "completions/mean_length": 494.0625305175781, + "completions/mean_terminated_length": 494.0625305175781, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.0010317255610007739, + "grad_norm": 1.318339467048645, + "kl": 0.0, + "learning_rate": 1e-06, + "loss": 0.0159, + "num_tokens": 116746.0, + "reward": 0.28214287757873535, + "reward_std": 0.4351659417152405, + "rewards/code_format_reward/mean": 0.2321428507566452, + "rewards/code_format_reward/std": 0.4240971803665161, + "rewards/curriculum_aware_reward_fn/mean": 0.05000000074505806, + "rewards/curriculum_aware_reward_fn/std": 0.12302493304014206, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3506.0, + "completions/max_terminated_length": 3506.0, + "completions/mean_length": 516.8660888671875, + "completions/mean_terminated_length": 516.8660888671875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.0020634511220015478, + "grad_norm": 1.158948302268982, + "kl": 0.0006999969482421875, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 238349.0, + "reward": 0.5758929252624512, + "reward_std": 0.47179263830184937, + "rewards/code_format_reward/mean": 0.4821428656578064, + "rewards/code_format_reward/std": 0.5019267797470093, + "rewards/curriculum_aware_reward_fn/mean": 0.0937499925494194, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1131.0, + "completions/max_terminated_length": 1131.0, + "completions/mean_length": 424.33929443359375, + "completions/mean_terminated_length": 424.33929443359375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.003095176683002321, + "grad_norm": 1.1996846199035645, + "kl": 0.004955291748046875, + "learning_rate": 1e-06, + "loss": 0.0258, + "num_tokens": 356799.0, + "reward": 0.8459821939468384, + "reward_std": 0.3950677216053009, + "rewards/code_format_reward/mean": 0.7678571343421936, + "rewards/code_format_reward/std": 0.4240972101688385, + "rewards/curriculum_aware_reward_fn/mean": 0.078125, + "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1900.0, + "completions/max_terminated_length": 1900.0, + "completions/mean_length": 483.3482360839844, + "completions/mean_terminated_length": 483.3482360839844, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.0041269022440030955, + "grad_norm": 0.8876407146453857, + "kl": 0.00455474853515625, + "learning_rate": 1e-06, + "loss": -0.0041, + "num_tokens": 481704.0, + "reward": 0.9250000715255737, + "reward_std": 0.2708400785923004, + "rewards/code_format_reward/mean": 0.875, + "rewards/code_format_reward/std": 0.33220529556274414, + "rewards/curriculum_aware_reward_fn/mean": 0.05000000074505806, + "rewards/curriculum_aware_reward_fn/std": 0.12302493304014206, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 990.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 445.794677734375, + "completions/mean_terminated_length": 445.794677734375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.005158627805003869, + "grad_norm": 0.6923364996910095, + "kl": 0.00545501708984375, + "learning_rate": 1e-06, + "loss": 0.016, + "num_tokens": 585043.0, + "reward": 1.0, + "reward_std": 0.17492596805095673, + "rewards/code_format_reward/mean": 0.9375, + "rewards/code_format_reward/std": 0.24314938485622406, + "rewards/curriculum_aware_reward_fn/mean": 0.0625, + "rewards/curriculum_aware_reward_fn/std": 0.13465002179145813, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 904.0, + "completions/max_terminated_length": 904.0, + "completions/mean_length": 486.4464416503906, + "completions/mean_terminated_length": 486.4464416503906, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.006190353366004642, + "grad_norm": 0.5436791777610779, + "kl": 0.0048828125, + "learning_rate": 1e-06, + "loss": 0.0092, + "num_tokens": 708685.0, + "reward": 1.0040178298950195, + "reward_std": 0.0781477838754654, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.02187499962747097, + "rewards/curriculum_aware_reward_fn/std": 0.08510228246450424, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 429.01788330078125, + "completions/mean_terminated_length": 429.01788330078125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.007222078927005417, + "grad_norm": 0.7159688472747803, + "kl": 0.00606536865234375, + "learning_rate": 1e-06, + "loss": 0.0251, + "num_tokens": 819672.0, + "reward": 1.0669643878936768, + "reward_std": 0.12266332656145096, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569157898426056, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 926.0, + "completions/max_terminated_length": 926.0, + "completions/mean_length": 455.6339416503906, + "completions/mean_terminated_length": 455.6339416503906, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.008253804488006191, + "grad_norm": 0.6650537252426147, + "kl": 0.006988525390625, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 934536.0, + "reward": 1.0316965579986572, + "reward_std": 0.08898404985666275, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.04062500223517418, + "rewards/curriculum_aware_reward_fn/std": 0.11261255294084549, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1310.0, + "completions/max_terminated_length": 1310.0, + "completions/mean_length": 413.3035888671875, + "completions/mean_terminated_length": 413.3035888671875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.009285530049006964, + "grad_norm": 0.743399441242218, + "kl": 0.00730133056640625, + "learning_rate": 1e-06, + "loss": 0.0313, + "num_tokens": 1041565.0, + "reward": 1.1218750476837158, + "reward_std": 0.10294599086046219, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1143.0, + "completions/max_terminated_length": 1143.0, + "completions/mean_length": 436.4285888671875, + "completions/mean_terminated_length": 436.4285888671875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.010317255610007738, + "grad_norm": 0.6795259118080139, + "kl": 0.007244110107421875, + "learning_rate": 1e-06, + "loss": 0.0189, + "num_tokens": 1154796.0, + "reward": 1.1071429252624512, + "reward_std": 0.11945624649524689, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 934.0, + "completions/max_terminated_length": 934.0, + "completions/mean_length": 481.3214416503906, + "completions/mean_terminated_length": 481.3214416503906, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.011348981171008512, + "grad_norm": 0.5997210741043091, + "kl": 0.005950927734375, + "learning_rate": 1e-06, + "loss": 0.0182, + "num_tokens": 1270556.0, + "reward": 1.054464340209961, + "reward_std": 0.1270672082901001, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.08124999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.14843402802944183, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 443.15179443359375, + "completions/mean_terminated_length": 443.15179443359375, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.012380706732009285, + "grad_norm": 0.6722776293754578, + "kl": 0.00591278076171875, + "learning_rate": 1e-06, + "loss": -0.0041, + "num_tokens": 1385646.0, + "reward": 1.0593750476837158, + "reward_std": 0.08607304841279984, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05937499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1456.0, + "completions/max_terminated_length": 1456.0, + "completions/mean_length": 550.3125, + "completions/mean_terminated_length": 550.3125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.013412432293010059, + "grad_norm": 0.6148671507835388, + "kl": 0.00577545166015625, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 1514299.0, + "reward": 1.0660715103149414, + "reward_std": 0.10993208736181259, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.07499999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.144259512424469, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1899.0, + "completions/max_terminated_length": 1899.0, + "completions/mean_length": 564.6339721679688, + "completions/mean_terminated_length": 564.6339721679688, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.014444157854010833, + "grad_norm": 0.6636642813682556, + "kl": 0.0090484619140625, + "learning_rate": 1e-06, + "loss": -0.0041, + "num_tokens": 1654104.0, + "reward": 0.9424107670783997, + "reward_std": 0.23124831914901733, + "rewards/code_format_reward/mean": 0.9017857313156128, + "rewards/code_format_reward/std": 0.2989417314529419, + "rewards/curriculum_aware_reward_fn/mean": 0.04062500223517418, + "rewards/curriculum_aware_reward_fn/std": 0.11261255294084549, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 939.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 467.96429443359375, + "completions/mean_terminated_length": 467.96429443359375, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.015475883415011608, + "grad_norm": 0.542773962020874, + "kl": 0.00925445556640625, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 1775382.0, + "reward": 1.041517972946167, + "reward_std": 0.07933254539966583, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.05937499925494194, + "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 936.0, + "completions/max_terminated_length": 936.0, + "completions/mean_length": 417.71429443359375, + "completions/mean_terminated_length": 417.71429443359375, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.016507608976012382, + "grad_norm": 0.6454997062683105, + "kl": 0.0098419189453125, + "learning_rate": 1e-06, + "loss": 0.0077, + "num_tokens": 1889737.0, + "reward": 1.0732142925262451, + "reward_std": 0.13696174323558807, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1122.0, + "completions/max_terminated_length": 1122.0, + "completions/mean_length": 518.25, + "completions/mean_terminated_length": 518.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.017539334537013153, + "grad_norm": 0.49701130390167236, + "kl": 0.00626373291015625, + "learning_rate": 1e-06, + "loss": 0.0162, + "num_tokens": 2026720.0, + "reward": 1.056249976158142, + "reward_std": 0.06265628337860107, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05624999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.12912124395370483, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1654.0, + "completions/max_terminated_length": 1654.0, + "completions/mean_length": 569.5, + "completions/mean_terminated_length": 569.5, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.018571060098013927, + "grad_norm": 0.4273010492324829, + "kl": 0.00580596923828125, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 2159447.0, + "reward": 0.9883929491043091, + "reward_std": 0.0637814998626709, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.0062500000931322575, + "rewards/curriculum_aware_reward_fn/std": 0.04655956104397774, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1708.0, + "completions/max_terminated_length": 1708.0, + "completions/mean_length": 495.89288330078125, + "completions/mean_terminated_length": 495.89288330078125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.0196027856590147, + "grad_norm": 0.5998135805130005, + "kl": 0.0064849853515625, + "learning_rate": 1e-06, + "loss": -0.0182, + "num_tokens": 2282838.0, + "reward": 1.0848214626312256, + "reward_std": 0.10725849866867065, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1220.0, + "completions/max_terminated_length": 1220.0, + "completions/mean_length": 477.0714416503906, + "completions/mean_terminated_length": 477.0714416503906, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.020634511220015476, + "grad_norm": 0.7616762518882751, + "kl": 0.00719451904296875, + "learning_rate": 1e-06, + "loss": 0.0129, + "num_tokens": 2399480.0, + "reward": 1.079017996788025, + "reward_std": 0.15464720129966736, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.09687500447034836, + "rewards/curriculum_aware_reward_fn/std": 0.17709888517856598, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 448.40179443359375, + "completions/mean_terminated_length": 448.40179443359375, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.02166623678101625, + "grad_norm": 0.5755221843719482, + "kl": 0.00826263427734375, + "learning_rate": 1e-06, + "loss": 0.0201, + "num_tokens": 2516544.0, + "reward": 1.1062500476837158, + "reward_std": 0.06990548223257065, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2976.0, + "completions/max_terminated_length": 2976.0, + "completions/mean_length": 522.8214721679688, + "completions/mean_terminated_length": 522.8214721679688, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.022697962342017024, + "grad_norm": 0.6775954961776733, + "kl": 0.007049560546875, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 2646255.0, + "reward": 1.1031250953674316, + "reward_std": 0.10498352348804474, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1326.0, + "completions/max_terminated_length": 1326.0, + "completions/mean_length": 466.6607360839844, + "completions/mean_terminated_length": 466.6607360839844, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.0237296879030178, + "grad_norm": 0.6193450093269348, + "kl": 0.00760650634765625, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 2770201.0, + "reward": 1.1125001907348633, + "reward_std": 0.07438036799430847, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11250000447034836, + "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1035.0, + "completions/max_terminated_length": 1035.0, + "completions/mean_length": 439.4285888671875, + "completions/mean_terminated_length": 439.4285888671875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.02476141346401857, + "grad_norm": 0.5675091743469238, + "kl": 0.00740814208984375, + "learning_rate": 1e-06, + "loss": 0.0062, + "num_tokens": 2885042.0, + "reward": 1.1062500476837158, + "reward_std": 0.06854972243309021, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1473.0, + "completions/max_terminated_length": 1473.0, + "completions/mean_length": 513.9375, + "completions/mean_terminated_length": 513.9375, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.025793139025019344, + "grad_norm": 0.6651517152786255, + "kl": 0.0066986083984375, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 3012092.0, + "reward": 1.0848215818405151, + "reward_std": 0.11312052607536316, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.0937499925494194, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 467.5000305175781, + "completions/mean_terminated_length": 467.5000305175781, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.026824864586020118, + "grad_norm": 0.46935516595840454, + "kl": 0.00611114501953125, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 3133073.0, + "reward": 1.0593750476837158, + "reward_std": 0.04615173488855362, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05937499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 427.3214416503906, + "completions/mean_terminated_length": 427.3214416503906, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.027856590147020893, + "grad_norm": 0.7585523724555969, + "kl": 0.00814056396484375, + "learning_rate": 1e-06, + "loss": 0.0275, + "num_tokens": 3249883.0, + "reward": 1.125000238418579, + "reward_std": 0.09292245656251907, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 820.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 489.70538330078125, + "completions/mean_terminated_length": 489.70538330078125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.028888315708021667, + "grad_norm": 0.4714300334453583, + "kl": 0.00653076171875, + "learning_rate": 1e-06, + "loss": -0.0087, + "num_tokens": 3378412.0, + "reward": 1.053125023841858, + "reward_std": 0.04717051237821579, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05312499776482582, + "rewards/curriculum_aware_reward_fn/std": 0.1261489987373352, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1270.0, + "completions/max_terminated_length": 1270.0, + "completions/mean_length": 452.8035888671875, + "completions/mean_terminated_length": 452.8035888671875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.02992004126902244, + "grad_norm": 0.6367431282997131, + "kl": 0.00838470458984375, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 3495220.0, + "reward": 1.0875000953674316, + "reward_std": 0.08884736895561218, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08749999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 941.0, + "completions/max_terminated_length": 941.0, + "completions/mean_length": 449.76788330078125, + "completions/mean_terminated_length": 449.76788330078125, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.030951766830023215, + "grad_norm": 0.6237981915473938, + "kl": 0.009368896484375, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 3608552.0, + "reward": 1.1031250953674316, + "reward_std": 0.08083000034093857, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10312499850988388, + "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1214.0, + "completions/max_terminated_length": 1214.0, + "completions/mean_length": 486.21429443359375, + "completions/mean_terminated_length": 486.21429443359375, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.031983492391023986, + "grad_norm": 0.5921357274055481, + "kl": 0.0074462890625, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 3732091.0, + "reward": 1.0750000476837158, + "reward_std": 0.08021100610494614, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.07499999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.144259512424469, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 952.0, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 461.357177734375, + "completions/mean_terminated_length": 461.357177734375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.033015217952024764, + "grad_norm": 0.698621392250061, + "kl": 0.00798797607421875, + "learning_rate": 1e-06, + "loss": 0.0067, + "num_tokens": 3848961.0, + "reward": 1.1156251430511475, + "reward_std": 0.08847898989915848, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1653594672679901, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1045.0, + "completions/max_terminated_length": 1045.0, + "completions/mean_length": 456.732177734375, + "completions/mean_terminated_length": 456.732177734375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.034046943513025535, + "grad_norm": 0.6826543211936951, + "kl": 0.008026123046875, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 3961183.0, + "reward": 1.1156251430511475, + "reward_std": 0.08746020495891571, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1653594672679901, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1612.0, + "completions/max_terminated_length": 1612.0, + "completions/mean_length": 492.232177734375, + "completions/mean_terminated_length": 492.232177734375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.035078669074026306, + "grad_norm": 0.5144251585006714, + "kl": 0.00841522216796875, + "learning_rate": 1e-06, + "loss": -0.0074, + "num_tokens": 4077941.0, + "reward": 1.0687501430511475, + "reward_std": 0.06228790059685707, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.06874999403953552, + "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 500.1785888671875, + "completions/mean_terminated_length": 500.1785888671875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.036110394635027084, + "grad_norm": 0.4933493733406067, + "kl": 0.009246826171875, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 4199754.0, + "reward": 1.068750023841858, + "reward_std": 0.05096360296010971, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 866.0, + "completions/max_terminated_length": 866.0, + "completions/mean_length": 478.58038330078125, + "completions/mean_terminated_length": 478.58038330078125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.037142120196027854, + "grad_norm": 0.5227413177490234, + "kl": 0.00762176513671875, + "learning_rate": 1e-06, + "loss": 0.0116, + "num_tokens": 4326365.0, + "reward": 1.0812500715255737, + "reward_std": 0.07092425972223282, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08124999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 412.9821472167969, + "completions/mean_terminated_length": 412.9821472167969, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.03817384575702863, + "grad_norm": 0.6366561651229858, + "kl": 0.01012420654296875, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 4444495.0, + "reward": 1.0937501192092896, + "reward_std": 0.08746020495891571, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 428.6875305175781, + "completions/mean_terminated_length": 428.6875305175781, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.0392055713180294, + "grad_norm": 0.7173018455505371, + "kl": 0.01025390625, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 4551553.0, + "reward": 1.1187500953674316, + "reward_std": 0.08847897499799728, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16645818948745728, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 915.0, + "completions/max_terminated_length": 915.0, + "completions/mean_length": 469.8125305175781, + "completions/mean_terminated_length": 469.8125305175781, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.04023729687903018, + "grad_norm": 0.679751992225647, + "kl": 0.0091400146484375, + "learning_rate": 1e-06, + "loss": 0.0217, + "num_tokens": 4676041.0, + "reward": 1.0656250715255737, + "reward_std": 0.08363571017980576, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.06562499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.13722330331802368, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 915.0, + "completions/max_terminated_length": 915.0, + "completions/mean_length": 497.5000305175781, + "completions/mean_terminated_length": 497.5000305175781, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.04126902244003095, + "grad_norm": 0.5930312871932983, + "kl": 0.00982666015625, + "learning_rate": 1e-06, + "loss": -0.0113, + "num_tokens": 4799416.0, + "reward": 1.0531251430511475, + "reward_std": 0.07641790807247162, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05312499776482582, + "rewards/curriculum_aware_reward_fn/std": 0.1261489987373352, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 410.6607360839844, + "completions/mean_terminated_length": 410.6607360839844, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.04230074800103172, + "grad_norm": 0.7011821866035461, + "kl": 0.0117645263671875, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 4907101.0, + "reward": 1.109375, + "reward_std": 0.09430962055921555, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.109375, + "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1027.0, + "completions/max_terminated_length": 1027.0, + "completions/mean_length": 432.2589416503906, + "completions/mean_terminated_length": 432.2589416503906, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.0433324735620325, + "grad_norm": 0.40348029136657715, + "kl": 0.0109710693359375, + "learning_rate": 1e-06, + "loss": 0.01, + "num_tokens": 5025192.0, + "reward": 1.0687501430511475, + "reward_std": 0.022366588935256004, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1081.0, + "completions/max_terminated_length": 1081.0, + "completions/mean_length": 409.8571472167969, + "completions/mean_terminated_length": 409.8571472167969, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.04436419912303327, + "grad_norm": 0.7503907680511475, + "kl": 0.0126495361328125, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 5140459.0, + "reward": 1.1031250953674316, + "reward_std": 0.09394122660160065, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10312499850988388, + "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1031.0, + "completions/max_terminated_length": 1031.0, + "completions/mean_length": 426.8750305175781, + "completions/mean_terminated_length": 426.8750305175781, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.04539592468403405, + "grad_norm": 0.5602256655693054, + "kl": 0.011505126953125, + "learning_rate": 1e-06, + "loss": -0.0017, + "num_tokens": 5252585.0, + "reward": 1.0723215341567993, + "reward_std": 0.07662393152713776, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.08124999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 401.7946472167969, + "completions/mean_terminated_length": 401.7946472167969, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.04642765024503482, + "grad_norm": 0.6367837190628052, + "kl": 0.011199951171875, + "learning_rate": 1e-06, + "loss": -0.0068, + "num_tokens": 5355728.0, + "reward": 1.078125, + "reward_std": 0.07434897124767303, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.078125, + "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 716.0, + "completions/max_terminated_length": 716.0, + "completions/mean_length": 388.1607360839844, + "completions/mean_terminated_length": 388.1607360839844, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.0474593758060356, + "grad_norm": 0.6154026389122009, + "kl": 0.014129638671875, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 5469559.0, + "reward": 1.0875000953674316, + "reward_std": 0.05784441903233528, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.15223558247089386, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1075.0, + "completions/max_terminated_length": 1075.0, + "completions/mean_length": 408.83038330078125, + "completions/mean_terminated_length": 408.83038330078125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.04849110136703637, + "grad_norm": 0.7130938768386841, + "kl": 0.0154571533203125, + "learning_rate": 1e-06, + "loss": 0.0225, + "num_tokens": 5587539.0, + "reward": 1.1125000715255737, + "reward_std": 0.07780507206916809, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11249999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 958.0, + "completions/max_terminated_length": 958.0, + "completions/mean_length": 429.2410888671875, + "completions/mean_terminated_length": 429.2410888671875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.04952282692803714, + "grad_norm": 0.7699652314186096, + "kl": 0.015350341796875, + "learning_rate": 1e-06, + "loss": 0.0145, + "num_tokens": 5702863.0, + "reward": 1.1031250953674316, + "reward_std": 0.08709181845188141, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1635.0, + "completions/max_terminated_length": 1635.0, + "completions/mean_length": 436.9464416503906, + "completions/mean_terminated_length": 436.9464416503906, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.05055455248903792, + "grad_norm": 0.4649677574634552, + "kl": 0.0119171142578125, + "learning_rate": 1e-06, + "loss": 0.0095, + "num_tokens": 5813534.0, + "reward": 1.03125, + "reward_std": 0.04232724383473396, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.03125, + "rewards/curriculum_aware_reward_fn/std": 0.10025305300951004, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 836.0, + "completions/max_terminated_length": 836.0, + "completions/mean_length": 442.58929443359375, + "completions/mean_terminated_length": 442.58929443359375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.05158627805003869, + "grad_norm": 0.5999484062194824, + "kl": 0.0105743408203125, + "learning_rate": 1e-06, + "loss": 0.0132, + "num_tokens": 5941763.0, + "reward": 1.0625, + "reward_std": 0.06025035306811333, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.0625, + "rewards/curriculum_aware_reward_fn/std": 0.13465002179145813, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 379.1964416503906, + "completions/mean_terminated_length": 379.1964416503906, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.052618003611039466, + "grad_norm": 0.43735066056251526, + "kl": 0.013519287109375, + "learning_rate": 1e-06, + "loss": 0.0117, + "num_tokens": 6050796.0, + "reward": 1.068750023841858, + "reward_std": 0.03890253230929375, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 413.6785888671875, + "completions/mean_terminated_length": 413.6785888671875, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.053649729172040236, + "grad_norm": 0.7055557370185852, + "kl": 0.0141754150390625, + "learning_rate": 1e-06, + "loss": 0.0167, + "num_tokens": 6158672.0, + "reward": 1.0843751430511475, + "reward_std": 0.07678630203008652, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08437500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 438.7321472167969, + "completions/mean_terminated_length": 438.7321472167969, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.054681454733041014, + "grad_norm": 0.5913369655609131, + "kl": 0.0139923095703125, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 6282654.0, + "reward": 1.0531251430511475, + "reward_std": 0.06268768012523651, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05312499776482582, + "rewards/curriculum_aware_reward_fn/std": 0.1261489987373352, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.0, + "completions/max_terminated_length": 742.0, + "completions/mean_length": 371.8750305175781, + "completions/mean_terminated_length": 371.8750305175781, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.055713180294041785, + "grad_norm": 0.5662503242492676, + "kl": 0.016754150390625, + "learning_rate": 1e-06, + "loss": 0.0286, + "num_tokens": 6394360.0, + "reward": 1.0812500715255737, + "reward_std": 0.07777366787195206, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08124999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 375.9821472167969, + "completions/mean_terminated_length": 375.9821472167969, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.056744905855042556, + "grad_norm": 0.7322971224784851, + "kl": 0.0171356201171875, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 6508633.0, + "reward": 1.1281250715255737, + "reward_std": 0.09230346977710724, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 925.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 379.1964416503906, + "completions/mean_terminated_length": 379.1964416503906, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.057776631416043334, + "grad_norm": 0.5264835953712463, + "kl": 0.01763916015625, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 6619143.0, + "reward": 1.1187502145767212, + "reward_std": 0.0671311616897583, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.19114695489406586, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 748.0, + "completions/max_terminated_length": 748.0, + "completions/mean_length": 414.58929443359375, + "completions/mean_terminated_length": 414.58929443359375, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.058808356977044104, + "grad_norm": 0.4410315752029419, + "kl": 0.0155792236328125, + "learning_rate": 1e-06, + "loss": -0.0135, + "num_tokens": 6727877.0, + "reward": 1.0875000953674316, + "reward_std": 0.03165333718061447, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 364.95538330078125, + "completions/mean_terminated_length": 364.95538330078125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.05984008253804488, + "grad_norm": 0.6999755501747131, + "kl": 0.01708984375, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 6832991.0, + "reward": 1.0906251668930054, + "reward_std": 0.07984261959791183, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 793.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 381.3839416503906, + "completions/mean_terminated_length": 381.3839416503906, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.06087180809904565, + "grad_norm": 0.49779650568962097, + "kl": 0.0159149169921875, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 6942577.0, + "reward": 1.1031250953674316, + "reward_std": 0.05744463577866554, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10312499850988388, + "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 693.0, + "completions/max_terminated_length": 693.0, + "completions/mean_length": 346.0000305175781, + "completions/mean_terminated_length": 346.0000305175781, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.06190353366004643, + "grad_norm": 0.6053861379623413, + "kl": 0.017669677734375, + "learning_rate": 1e-06, + "loss": 0.0108, + "num_tokens": 7042495.0, + "reward": 1.1468751430511475, + "reward_std": 0.05886319279670715, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14687499403953552, + "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 848.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 426.1875305175781, + "completions/mean_terminated_length": 426.1875305175781, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.0629352592210472, + "grad_norm": 0.49604561924934387, + "kl": 0.01611328125, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 7159248.0, + "reward": 1.109375, + "reward_std": 0.05300115421414375, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.109375, + "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 386.40179443359375, + "completions/mean_terminated_length": 386.40179443359375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.06396698478204797, + "grad_norm": 0.7982628345489502, + "kl": 0.019561767578125, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 7274324.0, + "reward": 1.093750238418579, + "reward_std": 0.09779711812734604, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1083.0, + "completions/max_terminated_length": 1083.0, + "completions/mean_length": 393.02679443359375, + "completions/mean_terminated_length": 393.02679443359375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.06499871034304874, + "grad_norm": 0.703448474407196, + "kl": 0.01800537109375, + "learning_rate": 1e-06, + "loss": 0.0075, + "num_tokens": 7392375.0, + "reward": 1.0937501192092896, + "reward_std": 0.07157464325428009, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.0937499925494194, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 369.0357360839844, + "completions/mean_terminated_length": 369.0357360839844, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.06603043590404953, + "grad_norm": 0.7088890075683594, + "kl": 0.01617431640625, + "learning_rate": 1e-06, + "loss": 0.0169, + "num_tokens": 7503448.0, + "reward": 1.0906251668930054, + "reward_std": 0.09773432463407516, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 385.58929443359375, + "completions/mean_terminated_length": 385.58929443359375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.0670621614650503, + "grad_norm": 0.6222151517868042, + "kl": 0.0174560546875, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 7612679.0, + "reward": 1.078125, + "reward_std": 0.05988196283578873, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.078125, + "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 927.0, + "completions/max_terminated_length": 927.0, + "completions/mean_length": 420.6339416503906, + "completions/mean_terminated_length": 420.6339416503906, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.06809388702605107, + "grad_norm": 0.691880464553833, + "kl": 0.018096923828125, + "learning_rate": 1e-06, + "loss": -0.0125, + "num_tokens": 7725625.0, + "reward": 1.1437500715255737, + "reward_std": 0.08505426347255707, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14374999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.21299293637275696, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1575.0, + "completions/max_terminated_length": 1575.0, + "completions/mean_length": 431.6964416503906, + "completions/mean_terminated_length": 431.6964416503906, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.06912561258705184, + "grad_norm": 0.6876183152198792, + "kl": 0.0158538818359375, + "learning_rate": 1e-06, + "loss": -0.0117, + "num_tokens": 7832768.0, + "reward": 1.109375, + "reward_std": 0.1025775894522667, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.109375, + "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 772.0, + "completions/max_terminated_length": 772.0, + "completions/mean_length": 392.15179443359375, + "completions/mean_terminated_length": 392.15179443359375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.07015733814805261, + "grad_norm": 0.7046939134597778, + "kl": 0.017791748046875, + "learning_rate": 1e-06, + "loss": -0.0103, + "num_tokens": 7945079.0, + "reward": 1.1187500953674316, + "reward_std": 0.0778050646185875, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 644.0, + "completions/max_terminated_length": 644.0, + "completions/mean_length": 356.5446472167969, + "completions/mean_terminated_length": 356.5446472167969, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.0711890637090534, + "grad_norm": 0.797570526599884, + "kl": 0.02020263671875, + "learning_rate": 1e-06, + "loss": 0.0295, + "num_tokens": 8048810.0, + "reward": 1.1343750953674316, + "reward_std": 0.09572817385196686, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 365.7321472167969, + "completions/mean_terminated_length": 365.7321472167969, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.07222078927005417, + "grad_norm": 0.7524502277374268, + "kl": 0.023345947265625, + "learning_rate": 1e-06, + "loss": 0.046, + "num_tokens": 8163741.0, + "reward": 1.1062500476837158, + "reward_std": 0.09088490903377533, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 392.5446472167969, + "completions/mean_terminated_length": 392.5446472167969, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.07325251483105494, + "grad_norm": 0.6227968335151672, + "kl": 0.020721435546875, + "learning_rate": 1e-06, + "loss": 0.0054, + "num_tokens": 8274094.0, + "reward": 1.0848214626312256, + "reward_std": 0.08387312293052673, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569157898426056, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 397.6339416503906, + "completions/mean_terminated_length": 397.6339416503906, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.07428424039205571, + "grad_norm": 0.6036502122879028, + "kl": 0.02008056640625, + "learning_rate": 1e-06, + "loss": 0.0134, + "num_tokens": 8383855.0, + "reward": 1.1156251430511475, + "reward_std": 0.0671311616897583, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562500894069672, + "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 379.2410888671875, + "completions/mean_terminated_length": 379.2410888671875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.0753159659530565, + "grad_norm": 0.7189438343048096, + "kl": 0.02191162109375, + "learning_rate": 1e-06, + "loss": 0.029, + "num_tokens": 8506648.0, + "reward": 1.0437500476837158, + "reward_std": 0.08366710692644119, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.04374999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.11627185344696045, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 948.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 395.3750305175781, + "completions/mean_terminated_length": 395.3750305175781, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.07634769151405726, + "grad_norm": 0.8699377179145813, + "kl": 0.024932861328125, + "learning_rate": 1e-06, + "loss": 0.0219, + "num_tokens": 8623345.0, + "reward": 1.0316965579986572, + "reward_std": 0.0907539427280426, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.04062500223517418, + "rewards/curriculum_aware_reward_fn/std": 0.11261255294084549, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 369.5357360839844, + "completions/mean_terminated_length": 369.5357360839844, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.07737941707505804, + "grad_norm": 0.6735115647315979, + "kl": 0.02252197265625, + "learning_rate": 1e-06, + "loss": 0.0164, + "num_tokens": 8737736.0, + "reward": 1.0500000715255737, + "reward_std": 0.06851832568645477, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05000000074505806, + "rewards/curriculum_aware_reward_fn/std": 0.12302493304014206, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 355.9821472167969, + "completions/mean_terminated_length": 355.9821472167969, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.0784111426360588, + "grad_norm": 0.5873785018920898, + "kl": 0.02325439453125, + "learning_rate": 1e-06, + "loss": 0.0123, + "num_tokens": 8842349.0, + "reward": 1.0968750715255737, + "reward_std": 0.055438488721847534, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09687500447034836, + "rewards/curriculum_aware_reward_fn/std": 0.15729716420173645, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 373.9107360839844, + "completions/mean_terminated_length": 373.9107360839844, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.07944286819705958, + "grad_norm": 0.863136351108551, + "kl": 0.02398681640625, + "learning_rate": 1e-06, + "loss": 0.0125, + "num_tokens": 8949845.0, + "reward": 1.0906250476837158, + "reward_std": 0.10260899364948273, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 382.4107360839844, + "completions/mean_terminated_length": 382.4107360839844, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.08047459375806036, + "grad_norm": 0.5056124925613403, + "kl": 0.025146484375, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 9048752.0, + "reward": 1.1218750476837158, + "reward_std": 0.053001150488853455, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 423.02679443359375, + "completions/mean_terminated_length": 423.02679443359375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.08150631931906113, + "grad_norm": 0.625390350818634, + "kl": 0.02386474609375, + "learning_rate": 1e-06, + "loss": 0.0161, + "num_tokens": 9173222.0, + "reward": 1.0718750953674316, + "reward_std": 0.08021100610494614, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.07187499850988388, + "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1038.0, + "completions/max_terminated_length": 1038.0, + "completions/mean_length": 440.1160888671875, + "completions/mean_terminated_length": 440.1160888671875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.0825380448800619, + "grad_norm": 0.6255428791046143, + "kl": 0.021820068359375, + "learning_rate": 1e-06, + "loss": -0.005, + "num_tokens": 9292914.0, + "reward": 1.0906251668930054, + "reward_std": 0.06611238420009613, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 355.95538330078125, + "completions/mean_terminated_length": 355.95538330078125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.08356977044106267, + "grad_norm": 0.850692868232727, + "kl": 0.024566650390625, + "learning_rate": 1e-06, + "loss": 0.0171, + "num_tokens": 9399526.0, + "reward": 1.0875000953674316, + "reward_std": 0.11288311332464218, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08749999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 360.39288330078125, + "completions/mean_terminated_length": 360.39288330078125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.08460149600206344, + "grad_norm": 0.81606525182724, + "kl": 0.028076171875, + "learning_rate": 1e-06, + "loss": 0.0153, + "num_tokens": 9509540.0, + "reward": 1.1343750953674316, + "reward_std": 0.09674695879220963, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437500596046448, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 655.0, + "completions/max_terminated_length": 655.0, + "completions/mean_length": 366.2232360839844, + "completions/mean_terminated_length": 366.2232360839844, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.08563322156306423, + "grad_norm": 0.6983723640441895, + "kl": 0.02325439453125, + "learning_rate": 1e-06, + "loss": -0.0121, + "num_tokens": 9611618.0, + "reward": 1.0973215103149414, + "reward_std": 0.09658458083868027, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 912.0, + "completions/max_terminated_length": 912.0, + "completions/mean_length": 444.6160888671875, + "completions/mean_terminated_length": 444.6160888671875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.086664947124065, + "grad_norm": 0.59740149974823, + "kl": 0.020477294921875, + "learning_rate": 1e-06, + "loss": 0.014, + "num_tokens": 9724940.0, + "reward": 1.0625001192092896, + "reward_std": 0.08261694014072418, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.0625, + "rewards/curriculum_aware_reward_fn/std": 0.13465002179145813, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 407.6964416503906, + "completions/mean_terminated_length": 407.6964416503906, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.08769667268506577, + "grad_norm": 0.6780855059623718, + "kl": 0.023468017578125, + "learning_rate": 1e-06, + "loss": 0.0123, + "num_tokens": 9827526.0, + "reward": 1.0906250476837158, + "reward_std": 0.07678630203008652, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 391.5535888671875, + "completions/mean_terminated_length": 391.5535888671875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.08872839824606654, + "grad_norm": 0.6756490468978882, + "kl": 0.019744873046875, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 9936438.0, + "reward": 1.0937501192092896, + "reward_std": 0.08122977614402771, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 806.0, + "completions/max_terminated_length": 806.0, + "completions/mean_length": 383.7321472167969, + "completions/mean_terminated_length": 383.7321472167969, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.08976012380706733, + "grad_norm": 0.5501047968864441, + "kl": 0.02313232421875, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 10049997.0, + "reward": 1.1156251430511475, + "reward_std": 0.0612691231071949, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1653594672679901, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1270.0, + "completions/max_terminated_length": 1270.0, + "completions/mean_length": 432.8035888671875, + "completions/mean_terminated_length": 432.8035888671875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.0907918493680681, + "grad_norm": 0.6241595149040222, + "kl": 0.019317626953125, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 10163039.0, + "reward": 1.1000001430511475, + "reward_std": 0.06851832568645477, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.15882451832294464, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 403.0357360839844, + "completions/mean_terminated_length": 403.0357360839844, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.09182357492906887, + "grad_norm": 0.6893149018287659, + "kl": 0.0233154296875, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 10276142.0, + "reward": 1.0473215579986572, + "reward_std": 0.11236942559480667, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.05625000223517418, + "rewards/curriculum_aware_reward_fn/std": 0.1374027132987976, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 365.3035888671875, + "completions/mean_terminated_length": 365.3035888671875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.09285530049006964, + "grad_norm": 0.6915412545204163, + "kl": 0.020111083984375, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 10370004.0, + "reward": 1.1218751668930054, + "reward_std": 0.07296179980039597, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 402.0982360839844, + "completions/mean_terminated_length": 402.0982360839844, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.09388702605107041, + "grad_norm": 0.7040698528289795, + "kl": 0.020416259765625, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 10490156.0, + "reward": 1.0500000715255737, + "reward_std": 0.07678629457950592, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05000000074505806, + "rewards/curriculum_aware_reward_fn/std": 0.12302493304014206, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 914.0, + "completions/max_terminated_length": 914.0, + "completions/mean_length": 396.14288330078125, + "completions/mean_terminated_length": 396.14288330078125, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.0949187516120712, + "grad_norm": 0.5344059467315674, + "kl": 0.02386474609375, + "learning_rate": 1e-06, + "loss": 0.0047, + "num_tokens": 10601045.0, + "reward": 1.071874976158142, + "reward_std": 0.0554070845246315, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.07187499850988388, + "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 446.169677734375, + "completions/mean_terminated_length": 446.169677734375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.09595047717307197, + "grad_norm": 0.4314137101173401, + "kl": 0.020782470703125, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 10729780.0, + "reward": 1.0531251430511475, + "reward_std": 0.03992130607366562, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05312499776482582, + "rewards/curriculum_aware_reward_fn/std": 0.1261489987373352, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 394.21429443359375, + "completions/mean_terminated_length": 394.21429443359375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.09698220273407274, + "grad_norm": 0.6997676491737366, + "kl": 0.021209716796875, + "learning_rate": 1e-06, + "loss": -0.0077, + "num_tokens": 10847187.0, + "reward": 1.0937501192092896, + "reward_std": 0.08057939261198044, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.0, + "completions/max_terminated_length": 773.0, + "completions/mean_length": 394.45538330078125, + "completions/mean_terminated_length": 394.45538330078125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.09801392829507351, + "grad_norm": 0.7313223481178284, + "kl": 0.021942138671875, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 10953911.0, + "reward": 1.1687501668930054, + "reward_std": 0.09292246401309967, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16875000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17567436397075653, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 805.0, + "completions/max_terminated_length": 805.0, + "completions/mean_length": 402.9821472167969, + "completions/mean_terminated_length": 402.9821472167969, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.09904565385607428, + "grad_norm": 0.6087774038314819, + "kl": 0.024078369140625, + "learning_rate": 1e-06, + "loss": -0.0051, + "num_tokens": 11056746.0, + "reward": 1.0500000715255737, + "reward_std": 0.06367506086826324, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05000000074505806, + "rewards/curriculum_aware_reward_fn/std": 0.12302493304014206, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1242.0, + "completions/max_terminated_length": 1242.0, + "completions/mean_length": 407.1339416503906, + "completions/mean_terminated_length": 407.1339416503906, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.10007737941707506, + "grad_norm": 0.6327843070030212, + "kl": 0.023345947265625, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 11170154.0, + "reward": 1.125, + "reward_std": 0.07296179980039597, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.16845881938934326, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 921.0, + "completions/max_terminated_length": 921.0, + "completions/mean_length": 403.0535888671875, + "completions/mean_terminated_length": 403.0535888671875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.10110910497807583, + "grad_norm": 0.4795805811882019, + "kl": 0.022491455078125, + "learning_rate": 1e-06, + "loss": 0.0124, + "num_tokens": 11279031.0, + "reward": 1.1000001430511475, + "reward_std": 0.028228629380464554, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.15882451832294464, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 769.0, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 429.8571472167969, + "completions/mean_terminated_length": 429.8571472167969, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.1021408305390766, + "grad_norm": 0.548716127872467, + "kl": 0.02044677734375, + "learning_rate": 1e-06, + "loss": 0.008, + "num_tokens": 11393521.0, + "reward": 1.078125, + "reward_std": 0.06509362161159515, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.078125, + "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 797.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 419.02679443359375, + "completions/mean_terminated_length": 419.02679443359375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.10317255610007738, + "grad_norm": 0.5905047655105591, + "kl": 0.021240234375, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 11512979.0, + "reward": 1.0718750953674316, + "reward_std": 0.06953709572553635, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.07187499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1100.0, + "completions/max_terminated_length": 1100.0, + "completions/mean_length": 411.2857360839844, + "completions/mean_terminated_length": 411.2857360839844, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.10420428166107815, + "grad_norm": 0.6256059408187866, + "kl": 0.025115966796875, + "learning_rate": 1e-06, + "loss": -0.0189, + "num_tokens": 11629246.0, + "reward": 1.1156251430511475, + "reward_std": 0.07095565646886826, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562500894069672, + "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 972.0, + "completions/max_terminated_length": 972.0, + "completions/mean_length": 479.294677734375, + "completions/mean_terminated_length": 479.294677734375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.10523600722207893, + "grad_norm": 0.6571237444877625, + "kl": 0.020477294921875, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 11758037.0, + "reward": 1.09375, + "reward_std": 0.08122977614402771, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1053.0, + "completions/max_terminated_length": 1053.0, + "completions/mean_length": 421.5446472167969, + "completions/mean_terminated_length": 421.5446472167969, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.1062677327830797, + "grad_norm": 0.5904666185379028, + "kl": 0.02593994140625, + "learning_rate": 1e-06, + "loss": 0.0078, + "num_tokens": 11871197.0, + "reward": 1.09375, + "reward_std": 0.0612691231071949, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569157898426056, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1443.0, + "completions/max_terminated_length": 1443.0, + "completions/mean_length": 438.1964416503906, + "completions/mean_terminated_length": 438.1964416503906, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.10729945834408047, + "grad_norm": 0.7135812640190125, + "kl": 0.021026611328125, + "learning_rate": 1e-06, + "loss": 0.0449, + "num_tokens": 11986334.0, + "reward": 1.1218751668930054, + "reward_std": 0.08949775248765945, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 433.6160888671875, + "completions/mean_terminated_length": 433.6160888671875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.10833118390508124, + "grad_norm": 0.748449444770813, + "kl": 0.025787353515625, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 12104731.0, + "reward": 1.09375, + "reward_std": 0.08986613899469376, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 421.3125305175781, + "completions/mean_terminated_length": 421.3125305175781, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.10936290946608203, + "grad_norm": 0.6255056858062744, + "kl": 0.023193359375, + "learning_rate": 1e-06, + "loss": -0.0051, + "num_tokens": 12216871.0, + "reward": 1.1129465103149414, + "reward_std": 0.09252168238162994, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 735.0, + "completions/max_terminated_length": 735.0, + "completions/mean_length": 431.8035888671875, + "completions/mean_terminated_length": 431.8035888671875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.1103946350270828, + "grad_norm": 0.6440974473953247, + "kl": 0.024749755859375, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 12332132.0, + "reward": 1.1156251430511475, + "reward_std": 0.07296180725097656, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1653594672679901, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 904.0, + "completions/max_terminated_length": 904.0, + "completions/mean_length": 467.3750305175781, + "completions/mean_terminated_length": 467.3750305175781, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.11142636058808357, + "grad_norm": 0.4589020907878876, + "kl": 0.020294189453125, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 12458314.0, + "reward": 1.0437501668930054, + "reward_std": 0.0364965982735157, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.04374999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.11627185344696045, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 919.0, + "completions/max_terminated_length": 919.0, + "completions/mean_length": 454.83929443359375, + "completions/mean_terminated_length": 454.83929443359375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.11245808614908434, + "grad_norm": 0.6246293783187866, + "kl": 0.022918701171875, + "learning_rate": 1e-06, + "loss": -0.005, + "num_tokens": 12573229.0, + "reward": 1.0718750953674316, + "reward_std": 0.07882384210824966, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.07187499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 470.0625305175781, + "completions/mean_terminated_length": 470.0625305175781, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.11348981171008511, + "grad_norm": 0.45937812328338623, + "kl": 0.022705078125, + "learning_rate": 1e-06, + "loss": -0.017, + "num_tokens": 12703625.0, + "reward": 1.1031250953674316, + "reward_std": 0.04232724383473396, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 402.5089416503906, + "completions/mean_terminated_length": 402.5089416503906, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.1145215372710859, + "grad_norm": 0.6906930804252625, + "kl": 0.030242919921875, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 12818244.0, + "reward": 1.1343750953674316, + "reward_std": 0.08465448766946793, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1179.0, + "completions/max_terminated_length": 1179.0, + "completions/mean_length": 411.76788330078125, + "completions/mean_terminated_length": 411.76788330078125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.11555326283208667, + "grad_norm": 0.6561564207077026, + "kl": 0.025909423828125, + "learning_rate": 1e-06, + "loss": 0.01, + "num_tokens": 12924310.0, + "reward": 1.1187500953674316, + "reward_std": 0.06367506086826324, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 900.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 432.3839416503906, + "completions/mean_terminated_length": 432.3839416503906, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.11658498839308744, + "grad_norm": 0.7264392971992493, + "kl": 0.026153564453125, + "learning_rate": 1e-06, + "loss": -0.0138, + "num_tokens": 13042268.0, + "reward": 1.09375, + "reward_std": 0.09394123405218124, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.0937499925494194, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1294.0, + "completions/max_terminated_length": 1294.0, + "completions/mean_length": 435.27679443359375, + "completions/mean_terminated_length": 435.27679443359375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.11761671395408821, + "grad_norm": 0.5870400071144104, + "kl": 0.0267333984375, + "learning_rate": 1e-06, + "loss": 0.0287, + "num_tokens": 13162461.0, + "reward": 1.1093751192092896, + "reward_std": 0.07296179980039597, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.109375, + "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1016.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 478.2232360839844, + "completions/mean_terminated_length": 478.2232360839844, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.11864843951508898, + "grad_norm": 0.4974851608276367, + "kl": 0.025238037109375, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 13290187.0, + "reward": 1.0250000953674316, + "reward_std": 0.054419707506895065, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.02500000037252903, + "rewards/curriculum_aware_reward_fn/std": 0.09054389595985413, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 985.0, + "completions/max_terminated_length": 985.0, + "completions/mean_length": 481.544677734375, + "completions/mean_terminated_length": 481.544677734375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.11968016507608976, + "grad_norm": 0.47406908869743347, + "kl": 0.0255126953125, + "learning_rate": 1e-06, + "loss": 0.0284, + "num_tokens": 13405972.0, + "reward": 1.1187500953674316, + "reward_std": 0.04818928614258766, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 943.0, + "completions/max_terminated_length": 943.0, + "completions/mean_length": 441.0089416503906, + "completions/mean_terminated_length": 441.0089416503906, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.12071189063709054, + "grad_norm": 0.5233811736106873, + "kl": 0.02685546875, + "learning_rate": 1e-06, + "loss": -0.0066, + "num_tokens": 13514823.0, + "reward": 1.1281250715255737, + "reward_std": 0.0612691193819046, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16936305165290833, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1054.0, + "completions/max_terminated_length": 1054.0, + "completions/mean_length": 482.0000305175781, + "completions/mean_terminated_length": 482.0000305175781, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.1217436161980913, + "grad_norm": 0.575183629989624, + "kl": 0.02606201171875, + "learning_rate": 1e-06, + "loss": 0.0178, + "num_tokens": 13631562.0, + "reward": 1.0562500953674316, + "reward_std": 0.06851832568645477, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05624999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.12912124395370483, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1041.0, + "completions/max_terminated_length": 1041.0, + "completions/mean_length": 471.7410888671875, + "completions/mean_terminated_length": 471.7410888671875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.12277534175909208, + "grad_norm": 0.6857424378395081, + "kl": 0.02569580078125, + "learning_rate": 1e-06, + "loss": -0.0119, + "num_tokens": 13757400.0, + "reward": 1.1031250953674316, + "reward_std": 0.08709181845188141, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1008.0, + "completions/max_terminated_length": 1008.0, + "completions/mean_length": 484.58929443359375, + "completions/mean_terminated_length": 484.58929443359375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.12380706732009286, + "grad_norm": 0.4834694266319275, + "kl": 0.026031494140625, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 13879826.0, + "reward": 1.0687501430511475, + "reward_std": 0.059881966561079025, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1272.0, + "completions/max_terminated_length": 1272.0, + "completions/mean_length": 500.0982360839844, + "completions/mean_terminated_length": 500.0982360839844, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.12483879288109363, + "grad_norm": 0.5000414252281189, + "kl": 0.024505615234375, + "learning_rate": 1e-06, + "loss": 0.0203, + "num_tokens": 14007160.0, + "reward": 1.1218751668930054, + "reward_std": 0.06228789687156677, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 443.3214416503906, + "completions/mean_terminated_length": 443.3214416503906, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.1258705184420944, + "grad_norm": 0.6494409441947937, + "kl": 0.024810791015625, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 14124453.0, + "reward": 1.0812500715255737, + "reward_std": 0.08746020495891571, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08125000447034836, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1008.0, + "completions/max_terminated_length": 1008.0, + "completions/mean_length": 460.3035888671875, + "completions/mean_terminated_length": 460.3035888671875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.1269022440030952, + "grad_norm": 0.5224548578262329, + "kl": 0.0277099609375, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 14248128.0, + "reward": 1.1281250715255737, + "reward_std": 0.0616375133395195, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 939.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 432.02679443359375, + "completions/mean_terminated_length": 432.02679443359375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.12793396956409595, + "grad_norm": 0.7065784335136414, + "kl": 0.028961181640625, + "learning_rate": 1e-06, + "loss": 0.0206, + "num_tokens": 14367362.0, + "reward": 1.1187500953674316, + "reward_std": 0.08607304841279984, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1797.0, + "completions/max_terminated_length": 1797.0, + "completions/mean_length": 461.1607360839844, + "completions/mean_terminated_length": 461.1607360839844, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.12896569512509673, + "grad_norm": 0.7404457330703735, + "kl": 0.03076171875, + "learning_rate": 1e-06, + "loss": 0.0328, + "num_tokens": 14487052.0, + "reward": 1.1625001430511475, + "reward_std": 0.12836889922618866, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16250000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 973.0, + "completions/max_terminated_length": 973.0, + "completions/mean_length": 425.0714416503906, + "completions/mean_terminated_length": 425.0714416503906, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.1299974206860975, + "grad_norm": 0.7103374600410461, + "kl": 0.025634765625, + "learning_rate": 1e-06, + "loss": 0.0045, + "num_tokens": 14603547.0, + "reward": 1.1125000715255737, + "reward_std": 0.08847897499799728, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11250000447034836, + "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 912.0, + "completions/max_terminated_length": 912.0, + "completions/mean_length": 390.2321472167969, + "completions/mean_terminated_length": 390.2321472167969, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.13102914624709827, + "grad_norm": 0.6540249586105347, + "kl": 0.029022216796875, + "learning_rate": 1e-06, + "loss": 0.0149, + "num_tokens": 14708335.0, + "reward": 1.140625, + "reward_std": 0.09773431718349457, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.140625, + "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 442.0000305175781, + "completions/mean_terminated_length": 442.0000305175781, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.13206087180809906, + "grad_norm": 0.5070663094520569, + "kl": 0.0247802734375, + "learning_rate": 1e-06, + "loss": -0.0073, + "num_tokens": 14830408.0, + "reward": 1.0656250715255737, + "reward_std": 0.06330667436122894, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.06562499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.13722331821918488, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1415.0, + "completions/max_terminated_length": 1415.0, + "completions/mean_length": 449.7232360839844, + "completions/mean_terminated_length": 449.7232360839844, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.1330925973690998, + "grad_norm": 0.5839290022850037, + "kl": 0.02423095703125, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 14948593.0, + "reward": 1.1218750476837158, + "reward_std": 0.07438036054372787, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 716.0, + "completions/max_terminated_length": 716.0, + "completions/mean_length": 395.7589416503906, + "completions/mean_terminated_length": 395.7589416503906, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.1341243229301006, + "grad_norm": 0.6906790137290955, + "kl": 0.027679443359375, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 15069365.0, + "reward": 1.084375023841858, + "reward_std": 0.09088490903377533, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08437500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 797.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 379.3750305175781, + "completions/mean_terminated_length": 379.3750305175781, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.13515604849110135, + "grad_norm": 0.8479922413825989, + "kl": 0.03192138671875, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 15179328.0, + "reward": 1.1062501668930054, + "reward_std": 0.10843963176012039, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 446.8660888671875, + "completions/mean_terminated_length": 446.8660888671875, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.13618777405210214, + "grad_norm": 0.49666714668273926, + "kl": 0.02545166015625, + "learning_rate": 1e-06, + "loss": 0.0134, + "num_tokens": 15295282.0, + "reward": 1.078125, + "reward_std": 0.06509362161159515, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.078125, + "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 397.20538330078125, + "completions/mean_terminated_length": 397.20538330078125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.13721949961310292, + "grad_norm": 0.66218501329422, + "kl": 0.02874755859375, + "learning_rate": 1e-06, + "loss": 0.0243, + "num_tokens": 15402398.0, + "reward": 1.100000023841858, + "reward_std": 0.07539913058280945, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10000000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 404.3214416503906, + "completions/mean_terminated_length": 404.3214416503906, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.13825122517410368, + "grad_norm": 0.6109785437583923, + "kl": 0.025726318359375, + "learning_rate": 1e-06, + "loss": 0.0133, + "num_tokens": 15510143.0, + "reward": 1.0718750953674316, + "reward_std": 0.060281746089458466, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.07187499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 374.3125305175781, + "completions/mean_terminated_length": 374.3125305175781, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.13928295073510447, + "grad_norm": 0.5623469948768616, + "kl": 0.0333251953125, + "learning_rate": 1e-06, + "loss": -0.0078, + "num_tokens": 15614842.0, + "reward": 1.1000001430511475, + "reward_std": 0.05682564526796341, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09999998658895493, + "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 848.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 453.794677734375, + "completions/mean_terminated_length": 453.794677734375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.14031467629610522, + "grad_norm": 0.532106876373291, + "kl": 0.026397705078125, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 15738971.0, + "reward": 1.0625, + "reward_std": 0.06709976494312286, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.0625, + "rewards/curriculum_aware_reward_fn/std": 0.13465002179145813, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 956.0, + "completions/max_terminated_length": 956.0, + "completions/mean_length": 369.7857360839844, + "completions/mean_terminated_length": 369.7857360839844, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.141346401857106, + "grad_norm": 0.6140812039375305, + "kl": 0.030914306640625, + "learning_rate": 1e-06, + "loss": 0.0049, + "num_tokens": 15854524.0, + "reward": 1.1031250953674316, + "reward_std": 0.05096359923481941, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 785.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 332.6875, + "completions/mean_terminated_length": 332.6875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.1423781274181068, + "grad_norm": 0.7683741450309753, + "kl": 0.0350341796875, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 15957291.0, + "reward": 1.1218750476837158, + "reward_std": 0.08706042170524597, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 328.9285888671875, + "completions/mean_terminated_length": 328.9285888671875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.14340985297910755, + "grad_norm": 0.6324326395988464, + "kl": 0.03460693359375, + "learning_rate": 1e-06, + "loss": -0.0228, + "num_tokens": 16065335.0, + "reward": 1.1500002145767212, + "reward_std": 0.05543847754597664, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 360.6071472167969, + "completions/mean_terminated_length": 360.6071472167969, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.14444157854010833, + "grad_norm": 0.6328318119049072, + "kl": 0.034393310546875, + "learning_rate": 1e-06, + "loss": -0.0093, + "num_tokens": 16179272.0, + "reward": 1.109375, + "reward_std": 0.0612691268324852, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.109375, + "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 337.7410888671875, + "completions/mean_terminated_length": 337.7410888671875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.14547330410110912, + "grad_norm": 0.7572005391120911, + "kl": 0.03558349609375, + "learning_rate": 1e-06, + "loss": 0.0211, + "num_tokens": 16277775.0, + "reward": 1.09375, + "reward_std": 0.08261694014072418, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1168.0, + "completions/max_terminated_length": 1168.0, + "completions/mean_length": 392.33038330078125, + "completions/mean_terminated_length": 392.33038330078125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.14650502966210988, + "grad_norm": 0.6479676365852356, + "kl": 0.03375244140625, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 16392570.0, + "reward": 1.0785715579986572, + "reward_std": 0.07929752767086029, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.15223558247089386, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1152.0, + "completions/max_terminated_length": 1152.0, + "completions/mean_length": 432.8214416503906, + "completions/mean_terminated_length": 432.8214416503906, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.14753675522311066, + "grad_norm": 0.5537639260292053, + "kl": 0.028564453125, + "learning_rate": 1e-06, + "loss": 0.0156, + "num_tokens": 16515001.0, + "reward": 1.068750023841858, + "reward_std": 0.07296179980039597, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 987.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 372.6071472167969, + "completions/mean_terminated_length": 372.6071472167969, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.14856848078411142, + "grad_norm": 0.7606719732284546, + "kl": 0.03546142578125, + "learning_rate": 1e-06, + "loss": -0.0072, + "num_tokens": 16620524.0, + "reward": 1.1156251430511475, + "reward_std": 0.08746019750833511, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1653594672679901, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1073.0, + "completions/max_terminated_length": 1073.0, + "completions/mean_length": 375.5446472167969, + "completions/mean_terminated_length": 375.5446472167969, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.1496002063451122, + "grad_norm": 0.6230894327163696, + "kl": 0.0325927734375, + "learning_rate": 1e-06, + "loss": -0.015, + "num_tokens": 16728742.0, + "reward": 1.0812500715255737, + "reward_std": 0.08465448766946793, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08124999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.14843402802944183, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 376.21429443359375, + "completions/mean_terminated_length": 376.21429443359375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.150631931906113, + "grad_norm": 0.6115962862968445, + "kl": 0.0338134765625, + "learning_rate": 1e-06, + "loss": 0.0162, + "num_tokens": 16844434.0, + "reward": 1.0598214864730835, + "reward_std": 0.08146720379590988, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 373.26788330078125, + "completions/mean_terminated_length": 373.26788330078125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.15166365746711374, + "grad_norm": 0.7758038640022278, + "kl": 0.03533935546875, + "learning_rate": 1e-06, + "loss": 0.0179, + "num_tokens": 16948799.0, + "reward": 1.1625001430511475, + "reward_std": 0.08505426347255707, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16250000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 786.0, + "completions/max_terminated_length": 786.0, + "completions/mean_length": 368.0089416503906, + "completions/mean_terminated_length": 368.0089416503906, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.15269538302811453, + "grad_norm": 0.7160394191741943, + "kl": 0.037841796875, + "learning_rate": 1e-06, + "loss": 0.0157, + "num_tokens": 17055816.0, + "reward": 1.0968750715255737, + "reward_std": 0.07817345857620239, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09687499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.15729717910289764, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 385.4821472167969, + "completions/mean_terminated_length": 385.4821472167969, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.15372710858911529, + "grad_norm": 0.79217529296875, + "kl": 0.03387451171875, + "learning_rate": 1e-06, + "loss": 0.0223, + "num_tokens": 17157433.0, + "reward": 1.1031250953674316, + "reward_std": 0.10705246776342392, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 925.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 374.58929443359375, + "completions/mean_terminated_length": 374.58929443359375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.15475883415011607, + "grad_norm": 0.6225085258483887, + "kl": 0.032196044921875, + "learning_rate": 1e-06, + "loss": 0.0161, + "num_tokens": 17265978.0, + "reward": 1.1218751668930054, + "reward_std": 0.06330667436122894, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 962.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 393.5357360839844, + "completions/mean_terminated_length": 393.5357360839844, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.15579055971111685, + "grad_norm": 0.6503400206565857, + "kl": 0.03387451171875, + "learning_rate": 1e-06, + "loss": 0.0184, + "num_tokens": 17374356.0, + "reward": 1.1343750953674316, + "reward_std": 0.07780507206916809, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 874.0, + "completions/max_terminated_length": 874.0, + "completions/mean_length": 392.0982360839844, + "completions/mean_terminated_length": 392.0982360839844, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.1568222852721176, + "grad_norm": 0.7472891807556152, + "kl": 0.033172607421875, + "learning_rate": 1e-06, + "loss": 0.0196, + "num_tokens": 17486423.0, + "reward": 1.140625, + "reward_std": 0.10119043290615082, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.140625, + "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 887.0, + "completions/max_terminated_length": 887.0, + "completions/mean_length": 367.26788330078125, + "completions/mean_terminated_length": 367.26788330078125, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.1578540108331184, + "grad_norm": 0.7780349254608154, + "kl": 0.0379638671875, + "learning_rate": 1e-06, + "loss": 0.0195, + "num_tokens": 17586854.0, + "reward": 1.0968750715255737, + "reward_std": 0.10637068003416061, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09687500447034836, + "rewards/curriculum_aware_reward_fn/std": 0.15729716420173645, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1003.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 424.5625305175781, + "completions/mean_terminated_length": 424.5625305175781, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.15888573639411915, + "grad_norm": 0.5371277928352356, + "kl": 0.04486083984375, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 17708969.0, + "reward": 1.1000001430511475, + "reward_std": 0.05886319279670715, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.15882451832294464, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 361.1160888671875, + "completions/mean_terminated_length": 361.1160888671875, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.15991746195511994, + "grad_norm": 0.650229811668396, + "kl": 0.03973388671875, + "learning_rate": 1e-06, + "loss": -0.0119, + "num_tokens": 17814178.0, + "reward": 1.1031250953674316, + "reward_std": 0.07882384210824966, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 369.1875305175781, + "completions/mean_terminated_length": 369.1875305175781, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.16094918751612072, + "grad_norm": 0.7069404125213623, + "kl": 0.03704833984375, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 17927543.0, + "reward": 1.1000001430511475, + "reward_std": 0.09532840549945831, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.15882451832294464, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 964.0, + "completions/max_terminated_length": 964.0, + "completions/mean_length": 362.58929443359375, + "completions/mean_terminated_length": 362.58929443359375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.16198091307712148, + "grad_norm": 0.6486988067626953, + "kl": 0.04095458984375, + "learning_rate": 1e-06, + "loss": 0.0077, + "num_tokens": 18035395.0, + "reward": 1.1218751668930054, + "reward_std": 0.059231579303741455, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 775.0, + "completions/max_terminated_length": 775.0, + "completions/mean_length": 360.27679443359375, + "completions/mean_terminated_length": 360.27679443359375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.16301263863812226, + "grad_norm": 0.7021763920783997, + "kl": 0.03875732421875, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 18135925.0, + "reward": 1.0812500715255737, + "reward_std": 0.08400409668684006, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08125000447034836, + "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 955.0, + "completions/max_terminated_length": 955.0, + "completions/mean_length": 335.83929443359375, + "completions/mean_terminated_length": 335.83929443359375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.16404436419912302, + "grad_norm": 0.7946338057518005, + "kl": 0.04852294921875, + "learning_rate": 1e-06, + "loss": -0.0231, + "num_tokens": 18237057.0, + "reward": 1.0843751430511475, + "reward_std": 0.10982678830623627, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08437500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 364.2321472167969, + "completions/mean_terminated_length": 364.2321472167969, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.1650760897601238, + "grad_norm": 0.5329174995422363, + "kl": 0.0428466796875, + "learning_rate": 1e-06, + "loss": 0.0073, + "num_tokens": 18350553.0, + "reward": 1.0593751668930054, + "reward_std": 0.053001150488853455, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05937499925494194, + "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 325.3571472167969, + "completions/mean_terminated_length": 325.3571472167969, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.1661078153211246, + "grad_norm": 0.6059619188308716, + "kl": 0.05255126953125, + "learning_rate": 1e-06, + "loss": 0.0246, + "num_tokens": 18455292.0, + "reward": 1.1468751430511475, + "reward_std": 0.06953709572553635, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672, + "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 351.8214416503906, + "completions/mean_terminated_length": 351.8214416503906, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.16713954088212535, + "grad_norm": 0.787862241268158, + "kl": 0.0474853515625, + "learning_rate": 1e-06, + "loss": 0.0233, + "num_tokens": 18557857.0, + "reward": 1.140625238418579, + "reward_std": 0.10260899364948273, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.140625, + "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1058.0, + "completions/max_terminated_length": 1058.0, + "completions/mean_length": 379.33929443359375, + "completions/mean_terminated_length": 379.33929443359375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.16817126644312613, + "grad_norm": 0.5836665034294128, + "kl": 0.0445556640625, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 18668040.0, + "reward": 1.0656250715255737, + "reward_std": 0.06130051985383034, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.06562499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.13722331821918488, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 305.39288330078125, + "completions/mean_terminated_length": 305.39288330078125, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.1692029920041269, + "grad_norm": 0.588325023651123, + "kl": 0.0758056640625, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 18773707.0, + "reward": 1.1531251668930054, + "reward_std": 0.05401992052793503, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15312500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17440778017044067, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 933.0, + "completions/max_terminated_length": 933.0, + "completions/mean_length": 349.0535888671875, + "completions/mean_terminated_length": 349.0535888671875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.17023471756512767, + "grad_norm": 0.5635197162628174, + "kl": 0.04345703125, + "learning_rate": 1e-06, + "loss": -0.0101, + "num_tokens": 18882122.0, + "reward": 1.09375, + "reward_std": 0.07882384210824966, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.0937499925494194, + "rewards/curriculum_aware_reward_fn/std": 0.15569154918193817, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 251.52679443359375, + "completions/mean_terminated_length": 251.52679443359375, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.17126644312612846, + "grad_norm": 0.9392991662025452, + "kl": 0.05517578125, + "learning_rate": 1e-06, + "loss": -0.0068, + "num_tokens": 18963171.0, + "reward": 1.2062500715255737, + "reward_std": 0.11976392567157745, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.20624998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.17296110093593597, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 319.33929443359375, + "completions/mean_terminated_length": 319.33929443359375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.17229816868712922, + "grad_norm": 0.6024711728096008, + "kl": 0.0548095703125, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 19059935.0, + "reward": 1.1406251192092896, + "reward_std": 0.05198238044977188, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.140625, + "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 327.8482360839844, + "completions/mean_terminated_length": 327.8482360839844, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.17332989424813, + "grad_norm": 0.6369600296020508, + "kl": 0.0540771484375, + "learning_rate": 1e-06, + "loss": 0.0193, + "num_tokens": 19154831.0, + "reward": 1.1062501668930054, + "reward_std": 0.055438488721847534, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 339.6696472167969, + "completions/mean_terminated_length": 339.6696472167969, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.17436161980913076, + "grad_norm": 0.6260164380073547, + "kl": 0.0457763671875, + "learning_rate": 1e-06, + "loss": -0.009, + "num_tokens": 19253165.0, + "reward": 1.1156251430511475, + "reward_std": 0.06228789687156677, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1352.0, + "completions/max_terminated_length": 1352.0, + "completions/mean_length": 331.9107360839844, + "completions/mean_terminated_length": 331.9107360839844, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.17539334537013154, + "grad_norm": 0.6983166933059692, + "kl": 0.0447998046875, + "learning_rate": 1e-06, + "loss": 0.0232, + "num_tokens": 19356020.0, + "reward": 1.1156251430511475, + "reward_std": 0.0981341153383255, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.17190392315387726, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1015.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 369.8035888671875, + "completions/mean_terminated_length": 369.8035888671875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.17642507093113233, + "grad_norm": 0.3841709792613983, + "kl": 0.046875, + "learning_rate": 1e-06, + "loss": 0.009, + "num_tokens": 19471267.0, + "reward": 1.0593751668930054, + "reward_std": 0.043346013873815536, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05937499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 329.77679443359375, + "completions/mean_terminated_length": 329.77679443359375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.17745679649213308, + "grad_norm": 0.6240721344947815, + "kl": 0.0479736328125, + "learning_rate": 1e-06, + "loss": 0.0103, + "num_tokens": 19569081.0, + "reward": 1.1031250953674316, + "reward_std": 0.08021100610494614, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.0, + "completions/max_terminated_length": 678.0, + "completions/mean_length": 363.8660888671875, + "completions/mean_terminated_length": 363.8660888671875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.17848852205313387, + "grad_norm": 0.7446913719177246, + "kl": 0.04083251953125, + "learning_rate": 1e-06, + "loss": -0.0144, + "num_tokens": 19669096.0, + "reward": 1.1500000953674316, + "reward_std": 0.08122977614402771, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 320.9732360839844, + "completions/mean_terminated_length": 320.9732360839844, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.17952024761413465, + "grad_norm": 0.7765859365463257, + "kl": 0.07684326171875, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 19765840.0, + "reward": 1.125, + "reward_std": 0.0671311616897583, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 360.9285888671875, + "completions/mean_terminated_length": 360.9285888671875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.1805519731751354, + "grad_norm": 0.7132924199104309, + "kl": 0.04248046875, + "learning_rate": 1e-06, + "loss": 0.0156, + "num_tokens": 19867556.0, + "reward": 1.1093751192092896, + "reward_std": 0.09430961310863495, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.109375, + "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1154.0, + "completions/max_terminated_length": 1154.0, + "completions/mean_length": 376.3750305175781, + "completions/mean_terminated_length": 376.3750305175781, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.1815836987361362, + "grad_norm": 0.5997862815856934, + "kl": 0.04632568359375, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 19973858.0, + "reward": 1.15625, + "reward_std": 0.08159816265106201, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15625, + "rewards/curriculum_aware_reward_fn/std": 0.21446822583675385, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 366.76788330078125, + "completions/mean_terminated_length": 366.76788330078125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.18261542429713695, + "grad_norm": 0.5875352025032043, + "kl": 0.0478515625, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 20090788.0, + "reward": 1.0875000953674316, + "reward_std": 0.0781734511256218, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1195.0, + "completions/max_terminated_length": 1195.0, + "completions/mean_length": 409.6250305175781, + "completions/mean_terminated_length": 409.6250305175781, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.18364714985813774, + "grad_norm": 0.6117048263549805, + "kl": 0.0433349609375, + "learning_rate": 1e-06, + "loss": 0.0216, + "num_tokens": 20214415.0, + "reward": 1.1093751192092896, + "reward_std": 0.09051652252674103, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.109375, + "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1059.0, + "completions/max_terminated_length": 1059.0, + "completions/mean_length": 322.4375, + "completions/mean_terminated_length": 322.4375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.18467887541913852, + "grad_norm": 0.6856574416160583, + "kl": 0.05731201171875, + "learning_rate": 1e-06, + "loss": 0.0232, + "num_tokens": 20318302.0, + "reward": 1.1160715818405151, + "reward_std": 0.07319922745227814, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 375.40179443359375, + "completions/mean_terminated_length": 375.40179443359375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.18571060098013928, + "grad_norm": 0.6979689598083496, + "kl": 0.0533447265625, + "learning_rate": 1e-06, + "loss": -0.0019, + "num_tokens": 20428394.0, + "reward": 1.1156251430511475, + "reward_std": 0.07919223606586456, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1192.0, + "completions/max_terminated_length": 1192.0, + "completions/mean_length": 351.1785888671875, + "completions/mean_terminated_length": 351.1785888671875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.18674232654114006, + "grad_norm": 0.6818945407867432, + "kl": 0.0538330078125, + "learning_rate": 1e-06, + "loss": 0.0187, + "num_tokens": 20538370.0, + "reward": 1.0875000953674316, + "reward_std": 0.07576751708984375, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08749999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.15223558247089386, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 335.375, + "completions/mean_terminated_length": 335.375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.18777405210214082, + "grad_norm": 0.8694007992744446, + "kl": 0.0506591796875, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 20648872.0, + "reward": 1.1625001430511475, + "reward_std": 0.1025775894522667, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16249999403953552, + "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 358.64288330078125, + "completions/mean_terminated_length": 358.64288330078125, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.1888057776631416, + "grad_norm": 0.5991372466087341, + "kl": 0.0462646484375, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 20752495.0, + "reward": 1.0875000953674316, + "reward_std": 0.05645725876092911, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08749999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.15223558247089386, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1019.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 357.65179443359375, + "completions/mean_terminated_length": 357.65179443359375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.1898375032241424, + "grad_norm": 0.7000672221183777, + "kl": 0.04669189453125, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 20853662.0, + "reward": 1.109375, + "reward_std": 0.08502288162708282, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.109375, + "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 377.95538330078125, + "completions/mean_terminated_length": 377.95538330078125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.19086922878514315, + "grad_norm": 0.6148974299430847, + "kl": 0.0625, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 20965608.0, + "reward": 1.0906250476837158, + "reward_std": 0.07231142371892929, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1100.0, + "completions/max_terminated_length": 1100.0, + "completions/mean_length": 381.6339416503906, + "completions/mean_terminated_length": 381.6339416503906, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.19190095434614393, + "grad_norm": 0.655315637588501, + "kl": 0.052734375, + "learning_rate": 1e-06, + "loss": 0.0075, + "num_tokens": 21079142.0, + "reward": 1.1187500953674316, + "reward_std": 0.06509362161159515, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16645818948745728, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 331.65179443359375, + "completions/mean_terminated_length": 331.65179443359375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.1929326799071447, + "grad_norm": 0.6411604285240173, + "kl": 0.051025390625, + "learning_rate": 1e-06, + "loss": 0.0284, + "num_tokens": 21179616.0, + "reward": 1.1500000953674316, + "reward_std": 0.05645725876092911, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15000000596046448, + "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 803.0, + "completions/max_terminated_length": 803.0, + "completions/mean_length": 345.6964416503906, + "completions/mean_terminated_length": 345.6964416503906, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.19396440546814547, + "grad_norm": 0.6874846816062927, + "kl": 0.05078125, + "learning_rate": 1e-06, + "loss": 0.0147, + "num_tokens": 21284729.0, + "reward": 1.1343750953674316, + "reward_std": 0.07882384210824966, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 346.27679443359375, + "completions/mean_terminated_length": 346.27679443359375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.19499613102914626, + "grad_norm": 0.7140980362892151, + "kl": 0.0557861328125, + "learning_rate": 1e-06, + "loss": 0.0166, + "num_tokens": 21387114.0, + "reward": 1.0750001668930054, + "reward_std": 0.08264832943677902, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.07499999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.144259512424469, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 317.1964416503906, + "completions/mean_terminated_length": 317.1964416503906, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.19602785659014701, + "grad_norm": 0.760844886302948, + "kl": 0.0556640625, + "learning_rate": 1e-06, + "loss": 0.016, + "num_tokens": 21492696.0, + "reward": 1.1281250715255737, + "reward_std": 0.08021100610494614, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 363.3839416503906, + "completions/mean_terminated_length": 363.3839416503906, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.1970595821511478, + "grad_norm": 0.8610868453979492, + "kl": 0.05108642578125, + "learning_rate": 1e-06, + "loss": -0.0172, + "num_tokens": 21592888.0, + "reward": 1.125, + "reward_std": 0.1225382462143898, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1249999925494194, + "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 325.26788330078125, + "completions/mean_terminated_length": 325.26788330078125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.19809130771214856, + "grad_norm": 0.6842600107192993, + "kl": 0.05517578125, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 21690848.0, + "reward": 1.15625, + "reward_std": 0.07231142371892929, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15625, + "rewards/curriculum_aware_reward_fn/std": 0.174774631857872, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 322.7232360839844, + "completions/mean_terminated_length": 322.7232360839844, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.19912303327314934, + "grad_norm": 0.7429232597351074, + "kl": 0.05718994140625, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 21797154.0, + "reward": 1.1281250715255737, + "reward_std": 0.08746020495891571, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 346.83038330078125, + "completions/mean_terminated_length": 346.83038330078125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.20015475883415013, + "grad_norm": 0.8076887130737305, + "kl": 0.06341552734375, + "learning_rate": 1e-06, + "loss": -0.0117, + "num_tokens": 21907826.0, + "reward": 1.1125000715255737, + "reward_std": 0.09535979479551315, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11250000447034836, + "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1751.0, + "completions/max_terminated_length": 1751.0, + "completions/mean_length": 386.58038330078125, + "completions/mean_terminated_length": 386.58038330078125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.20118648439515088, + "grad_norm": 0.47773370146751404, + "kl": 0.04888916015625, + "learning_rate": 1e-06, + "loss": 0.0154, + "num_tokens": 22020528.0, + "reward": 1.1125000715255737, + "reward_std": 0.05198238044977188, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11250000447034836, + "rewards/curriculum_aware_reward_fn/std": 0.20593667030334473, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 337.0535888671875, + "completions/mean_terminated_length": 337.0535888671875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.20221820995615167, + "grad_norm": 0.6952309608459473, + "kl": 0.0533447265625, + "learning_rate": 1e-06, + "loss": 0.0184, + "num_tokens": 22116819.0, + "reward": 1.1000001430511475, + "reward_std": 0.0812297835946083, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10000000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 950.0, + "completions/max_terminated_length": 950.0, + "completions/mean_length": 383.46429443359375, + "completions/mean_terminated_length": 383.46429443359375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.20324993551715242, + "grad_norm": 0.7165126204490662, + "kl": 0.05242919921875, + "learning_rate": 1e-06, + "loss": 0.0237, + "num_tokens": 22221465.0, + "reward": 1.09375, + "reward_std": 0.09430960565805435, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569157898426056, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 612.0, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 340.7589416503906, + "completions/mean_terminated_length": 340.7589416503906, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.2042816610781532, + "grad_norm": 0.5596606135368347, + "kl": 0.05694580078125, + "learning_rate": 1e-06, + "loss": -0.0137, + "num_tokens": 22319634.0, + "reward": 1.1187500953674316, + "reward_std": 0.06025035306811333, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 318.96429443359375, + "completions/mean_terminated_length": 318.96429443359375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.205313386639154, + "grad_norm": 0.5254331231117249, + "kl": 0.063720703125, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 22417224.0, + "reward": 1.0968750715255737, + "reward_std": 0.049576446413993835, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09687500447034836, + "rewards/curriculum_aware_reward_fn/std": 0.15729716420173645, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 358.9196472167969, + "completions/mean_terminated_length": 358.9196472167969, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.20634511220015475, + "grad_norm": 0.552406907081604, + "kl": 0.05364990234375, + "learning_rate": 1e-06, + "loss": 0.0127, + "num_tokens": 22534311.0, + "reward": 1.0937501192092896, + "reward_std": 0.05198238044977188, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569157898426056, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 318.2321472167969, + "completions/mean_terminated_length": 318.2321472167969, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.20737683776115554, + "grad_norm": 0.6902104020118713, + "kl": 0.06365966796875, + "learning_rate": 1e-06, + "loss": 0.0205, + "num_tokens": 22636453.0, + "reward": 1.09375, + "reward_std": 0.07536774128675461, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.0, + "completions/max_terminated_length": 595.0, + "completions/mean_length": 358.1339416503906, + "completions/mean_terminated_length": 358.1339416503906, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.2084085633221563, + "grad_norm": 0.45866531133651733, + "kl": 0.054931640625, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 22744485.0, + "reward": 1.0500000715255737, + "reward_std": 0.04130847007036209, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05000000074505806, + "rewards/curriculum_aware_reward_fn/std": 0.12302493304014206, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 807.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 307.2589416503906, + "completions/mean_terminated_length": 307.2589416503906, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.20944028888315708, + "grad_norm": 0.5563469529151917, + "kl": 0.069580078125, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 22838671.0, + "reward": 1.0781251192092896, + "reward_std": 0.028228627517819405, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.078125, + "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 305.9107360839844, + "completions/mean_terminated_length": 305.9107360839844, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.21047201444415786, + "grad_norm": 0.9100491404533386, + "kl": 0.068359375, + "learning_rate": 1e-06, + "loss": 0.0135, + "num_tokens": 22942639.0, + "reward": 1.1375001668930054, + "reward_std": 0.10803984105587006, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13749998807907104, + "rewards/curriculum_aware_reward_fn/std": 0.17170317471027374, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1178.0, + "completions/max_terminated_length": 1178.0, + "completions/mean_length": 385.33038330078125, + "completions/mean_terminated_length": 385.33038330078125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.21150374000515862, + "grad_norm": 0.7055003046989441, + "kl": 0.05218505859375, + "learning_rate": 1e-06, + "loss": 0.0161, + "num_tokens": 23049748.0, + "reward": 1.1160715818405151, + "reward_std": 0.10426744073629379, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.16845881938934326, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1795.0, + "completions/max_terminated_length": 1795.0, + "completions/mean_length": 399.8214416503906, + "completions/mean_terminated_length": 399.8214416503906, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.2125354655661594, + "grad_norm": 0.6340644955635071, + "kl": 0.0445556640625, + "learning_rate": 1e-06, + "loss": 0.0182, + "num_tokens": 23167216.0, + "reward": 1.0718750953674316, + "reward_std": 0.07333020120859146, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.07187499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1077.0, + "completions/max_terminated_length": 1077.0, + "completions/mean_length": 404.6607360839844, + "completions/mean_terminated_length": 404.6607360839844, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.2135671911271602, + "grad_norm": 0.48796433210372925, + "kl": 0.052734375, + "learning_rate": 1e-06, + "loss": 0.0068, + "num_tokens": 23297098.0, + "reward": 1.0437500476837158, + "reward_std": 0.05198238044977188, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.04374999925494194, + "rewards/curriculum_aware_reward_fn/std": 0.11627185344696045, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 345.52679443359375, + "completions/mean_terminated_length": 345.52679443359375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.21459891668816095, + "grad_norm": 0.6831227540969849, + "kl": 0.05828857421875, + "learning_rate": 1e-06, + "loss": -0.0164, + "num_tokens": 23405650.0, + "reward": 1.1031250953674316, + "reward_std": 0.06228789687156677, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10312499850988388, + "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 330.5982360839844, + "completions/mean_terminated_length": 330.5982360839844, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.21563064224916173, + "grad_norm": 0.6812708377838135, + "kl": 0.0550537109375, + "learning_rate": 1e-06, + "loss": 0.0184, + "num_tokens": 23508113.0, + "reward": 1.1218751668930054, + "reward_std": 0.06130051985383034, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 375.83038330078125, + "completions/mean_terminated_length": 375.83038330078125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.2166623678101625, + "grad_norm": 0.6429232358932495, + "kl": 0.04766845703125, + "learning_rate": 1e-06, + "loss": 0.019, + "num_tokens": 23622033.0, + "reward": 1.1062500476837158, + "reward_std": 0.0812297835946083, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 304.2410888671875, + "completions/mean_terminated_length": 304.2410888671875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.21769409337116327, + "grad_norm": 0.8792291283607483, + "kl": 0.06829833984375, + "learning_rate": 1e-06, + "loss": 0.0112, + "num_tokens": 23715664.0, + "reward": 1.1593750715255737, + "reward_std": 0.10159020870923996, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15937499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.1750844269990921, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 362.6696472167969, + "completions/mean_terminated_length": 362.6696472167969, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.21872581893216406, + "grad_norm": 0.7209181189537048, + "kl": 0.053955078125, + "learning_rate": 1e-06, + "loss": 0.0067, + "num_tokens": 23823280.0, + "reward": 1.0906251668930054, + "reward_std": 0.0860416442155838, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 367.6071472167969, + "completions/mean_terminated_length": 367.6071472167969, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.2197575444931648, + "grad_norm": 0.4966450035572052, + "kl": 0.0498046875, + "learning_rate": 1e-06, + "loss": 0.0063, + "num_tokens": 23930238.0, + "reward": 1.0562500953674316, + "reward_std": 0.04130847379565239, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05624999478459358, + "rewards/curriculum_aware_reward_fn/std": 0.12912124395370483, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 351.3571472167969, + "completions/mean_terminated_length": 351.3571472167969, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.2207892700541656, + "grad_norm": 0.7580758929252625, + "kl": 0.05255126953125, + "learning_rate": 1e-06, + "loss": 0.0136, + "num_tokens": 24030686.0, + "reward": 1.0973215103149414, + "reward_std": 0.11033187061548233, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 342.2589416503906, + "completions/mean_terminated_length": 342.2589416503906, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.22182099561516636, + "grad_norm": 0.7508425116539001, + "kl": 0.05023193359375, + "learning_rate": 1e-06, + "loss": 0.0152, + "num_tokens": 24134280.0, + "reward": 1.1156251430511475, + "reward_std": 0.09674695879220963, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1653594672679901, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 361.6071472167969, + "completions/mean_terminated_length": 361.6071472167969, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.22285272117616714, + "grad_norm": 0.7472397685050964, + "kl": 0.05438232421875, + "learning_rate": 1e-06, + "loss": 0.0032, + "num_tokens": 24236149.0, + "reward": 1.1062501668930054, + "reward_std": 0.11084556579589844, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 345.1785888671875, + "completions/mean_terminated_length": 345.1785888671875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.22388444673716792, + "grad_norm": 0.7892441153526306, + "kl": 0.05853271484375, + "learning_rate": 1e-06, + "loss": 0.0154, + "num_tokens": 24350395.0, + "reward": 1.1156251430511475, + "reward_std": 0.08847897499799728, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562498658895493, + "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1016.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 383.6160888671875, + "completions/mean_terminated_length": 383.6160888671875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.22491617229816868, + "grad_norm": 0.6343303918838501, + "kl": 0.0494384765625, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 24463178.0, + "reward": 1.1000001430511475, + "reward_std": 0.08465448766946793, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09999998658895493, + "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 350.3571472167969, + "completions/mean_terminated_length": 350.3571472167969, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.22594789785916947, + "grad_norm": 0.7159443497657776, + "kl": 0.0528564453125, + "learning_rate": 1e-06, + "loss": -0.0096, + "num_tokens": 24574693.0, + "reward": 1.1093751192092896, + "reward_std": 0.07539913803339005, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.109375, + "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 372.33038330078125, + "completions/mean_terminated_length": 372.33038330078125, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.22697962342017022, + "grad_norm": 0.6375694870948792, + "kl": 0.04815673828125, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 24693037.0, + "reward": 1.0812500715255737, + "reward_std": 0.07539913803339005, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08125000447034836, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 330.7857360839844, + "completions/mean_terminated_length": 330.7857360839844, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.228011348981171, + "grad_norm": 0.6578212380409241, + "kl": 0.06341552734375, + "learning_rate": 1e-06, + "loss": 0.0095, + "num_tokens": 24797355.0, + "reward": 1.078125, + "reward_std": 0.06469383090734482, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.078125, + "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.0, + "completions/max_terminated_length": 689.0, + "completions/mean_length": 361.0089416503906, + "completions/mean_terminated_length": 361.0089416503906, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.2290430745421718, + "grad_norm": 0.6693058609962463, + "kl": 0.0523681640625, + "learning_rate": 1e-06, + "loss": 0.0271, + "num_tokens": 24914732.0, + "reward": 1.0843751430511475, + "reward_std": 0.09532838314771652, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08437500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 390.8214416503906, + "completions/mean_terminated_length": 390.8214416503906, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.23007480010317255, + "grad_norm": 0.6660776138305664, + "kl": 0.051513671875, + "learning_rate": 1e-06, + "loss": 0.0148, + "num_tokens": 25030345.0, + "reward": 1.1000001430511475, + "reward_std": 0.10637068003416061, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1259.0, + "completions/max_terminated_length": 1259.0, + "completions/mean_length": 375.1875305175781, + "completions/mean_terminated_length": 375.1875305175781, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.23110652566417333, + "grad_norm": 0.5172949433326721, + "kl": 0.05389404296875, + "learning_rate": 1e-06, + "loss": 0.0174, + "num_tokens": 25146055.0, + "reward": 1.0625, + "reward_std": 0.0612691231071949, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.0625, + "rewards/curriculum_aware_reward_fn/std": 0.13465002179145813, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.0, + "completions/max_terminated_length": 742.0, + "completions/mean_length": 340.08038330078125, + "completions/mean_terminated_length": 340.08038330078125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.2321382512251741, + "grad_norm": 0.8066679835319519, + "kl": 0.0545654296875, + "learning_rate": 1e-06, + "loss": 0.0101, + "num_tokens": 25256727.0, + "reward": 1.1531251668930054, + "reward_std": 0.10807124525308609, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15312500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17440778017044067, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 329.125, + "completions/mean_terminated_length": 329.125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.23316997678617488, + "grad_norm": 0.7467411160469055, + "kl": 0.05596923828125, + "learning_rate": 1e-06, + "loss": 0.024, + "num_tokens": 25356526.0, + "reward": 1.1812502145767212, + "reward_std": 0.07197443395853043, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.18124999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17567436397075653, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 351.45538330078125, + "completions/mean_terminated_length": 351.45538330078125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.23420170234717566, + "grad_norm": 0.6787609457969666, + "kl": 0.0518798828125, + "learning_rate": 1e-06, + "loss": 0.0156, + "num_tokens": 25464321.0, + "reward": 1.1750000715255737, + "reward_std": 0.09915289282798767, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.17499998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.2101050615310669, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1135.0, + "completions/max_terminated_length": 1135.0, + "completions/mean_length": 411.58929443359375, + "completions/mean_terminated_length": 411.58929443359375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.23523342790817642, + "grad_norm": 0.5712829232215881, + "kl": 0.05523681640625, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 25584213.0, + "reward": 1.1250001192092896, + "reward_std": 0.05161399021744728, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 866.0, + "completions/max_terminated_length": 866.0, + "completions/mean_length": 351.6785888671875, + "completions/mean_terminated_length": 351.6785888671875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.2362651534691772, + "grad_norm": 0.6961349844932556, + "kl": 0.05596923828125, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 25699635.0, + "reward": 1.1160714626312256, + "reward_std": 0.08228814601898193, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 344.2589416503906, + "completions/mean_terminated_length": 344.2589416503906, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.23729687903017796, + "grad_norm": 0.869304895401001, + "kl": 0.0552978515625, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 25798252.0, + "reward": 1.2000001668930054, + "reward_std": 0.1284002959728241, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.20000001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1022.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 354.4464416503906, + "completions/mean_terminated_length": 354.4464416503906, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.23832860459117874, + "grad_norm": 0.6629135608673096, + "kl": 0.0545654296875, + "learning_rate": 1e-06, + "loss": 0.0183, + "num_tokens": 25906970.0, + "reward": 1.1375001668930054, + "reward_std": 0.08363571017980576, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17170318961143494, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 365.8125305175781, + "completions/mean_terminated_length": 365.8125305175781, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.23936033015217953, + "grad_norm": 0.6239770650863647, + "kl": 0.044677734375, + "learning_rate": 1e-06, + "loss": 0.0205, + "num_tokens": 26003752.0, + "reward": 1.1312501430511475, + "reward_std": 0.07641790807247162, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13124999403953552, + "rewards/curriculum_aware_reward_fn/std": 0.17020456492900848, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1034.0, + "completions/max_terminated_length": 1034.0, + "completions/mean_length": 378.6160888671875, + "completions/mean_terminated_length": 378.6160888671875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.24039205571318029, + "grad_norm": 0.6367329955101013, + "kl": 0.04547119140625, + "learning_rate": 1e-06, + "loss": -0.0092, + "num_tokens": 26107372.0, + "reward": 1.1593750715255737, + "reward_std": 0.05886319279670715, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15937498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.1750844269990921, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1737.0, + "completions/max_terminated_length": 1737.0, + "completions/mean_length": 376.6607360839844, + "completions/mean_terminated_length": 376.6607360839844, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.24142378127418107, + "grad_norm": 0.7532415390014648, + "kl": 0.0555419921875, + "learning_rate": 1e-06, + "loss": 0.014, + "num_tokens": 26216184.0, + "reward": 1.1375000476837158, + "reward_std": 0.10359635949134827, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17170318961143494, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 894.0, + "completions/max_terminated_length": 894.0, + "completions/mean_length": 378.1339416503906, + "completions/mean_terminated_length": 378.1339416503906, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.24245550683518186, + "grad_norm": 0.6156569719314575, + "kl": 0.05511474609375, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 26328087.0, + "reward": 1.0875000953674316, + "reward_std": 0.06330667436122894, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2234.0, + "completions/max_terminated_length": 2234.0, + "completions/mean_length": 457.6339416503906, + "completions/mean_terminated_length": 457.6339416503906, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.2434872323961826, + "grad_norm": 0.5021722316741943, + "kl": 0.03857421875, + "learning_rate": 1e-06, + "loss": 0.0202, + "num_tokens": 26451799.0, + "reward": 1.078125, + "reward_std": 0.05886319279670715, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.078125, + "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 408.1071472167969, + "completions/mean_terminated_length": 408.1071472167969, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.2445189579571834, + "grad_norm": 0.6339530348777771, + "kl": 0.04754638671875, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 26573660.0, + "reward": 1.0593750476837158, + "reward_std": 0.05886319279670715, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05937499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 932.0, + "completions/max_terminated_length": 932.0, + "completions/mean_length": 387.4464416503906, + "completions/mean_terminated_length": 387.4464416503906, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.24555068351818415, + "grad_norm": 0.7284670472145081, + "kl": 0.04962158203125, + "learning_rate": 1e-06, + "loss": 0.0168, + "num_tokens": 26678038.0, + "reward": 1.1218751668930054, + "reward_std": 0.09292246401309967, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1172.0, + "completions/max_terminated_length": 1172.0, + "completions/mean_length": 408.6071472167969, + "completions/mean_terminated_length": 408.6071472167969, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.24658240907918494, + "grad_norm": 0.5565657615661621, + "kl": 0.04766845703125, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 26795447.0, + "reward": 1.1343750953674316, + "reward_std": 0.05543848127126694, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 361.8839416503906, + "completions/mean_terminated_length": 361.8839416503906, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.24761413464018572, + "grad_norm": 0.39315348863601685, + "kl": 0.04290771484375, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 26908687.0, + "reward": 1.1218750476837158, + "reward_std": 0.027209853753447533, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 371.4107360839844, + "completions/mean_terminated_length": 371.4107360839844, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.24864586020118648, + "grad_norm": 0.8473479151725769, + "kl": 0.04461669921875, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 27015406.0, + "reward": 1.203125, + "reward_std": 0.13080620765686035, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.203125, + "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1189.0, + "completions/max_terminated_length": 1189.0, + "completions/mean_length": 395.5357360839844, + "completions/mean_terminated_length": 395.5357360839844, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.24967758576218727, + "grad_norm": 0.6841160655021667, + "kl": 0.0479736328125, + "learning_rate": 1e-06, + "loss": 0.0323, + "num_tokens": 27122804.0, + "reward": 1.1500002145767212, + "reward_std": 0.08468587696552277, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 875.0, + "completions/max_terminated_length": 875.0, + "completions/mean_length": 453.5982360839844, + "completions/mean_terminated_length": 453.5982360839844, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.250709311323188, + "grad_norm": 0.6680525541305542, + "kl": 0.04119873046875, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 27241542.0, + "reward": 1.1156251430511475, + "reward_std": 0.09292246401309967, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1049.0, + "completions/max_terminated_length": 1049.0, + "completions/mean_length": 442.2410888671875, + "completions/mean_terminated_length": 442.2410888671875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.2517410368841888, + "grad_norm": 0.7665314078330994, + "kl": 0.04949951171875, + "learning_rate": 1e-06, + "loss": 0.0443, + "num_tokens": 27355144.0, + "reward": 1.1343750953674316, + "reward_std": 0.11809477210044861, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 426.3750305175781, + "completions/mean_terminated_length": 426.3750305175781, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.2527727624451896, + "grad_norm": 0.5100035071372986, + "kl": 0.0462646484375, + "learning_rate": 1e-06, + "loss": -0.0017, + "num_tokens": 27479226.0, + "reward": 1.0718750953674316, + "reward_std": 0.06571260839700699, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.07187499850988388, + "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 401.5535888671875, + "completions/mean_terminated_length": 401.5535888671875, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.2538044880061904, + "grad_norm": 0.8056138157844543, + "kl": 0.05877685546875, + "learning_rate": 1e-06, + "loss": -0.0042, + "num_tokens": 27586836.0, + "reward": 1.1343750953674316, + "reward_std": 0.1304064393043518, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437500596046448, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 975.0, + "completions/max_terminated_length": 975.0, + "completions/mean_length": 422.3125305175781, + "completions/mean_terminated_length": 422.3125305175781, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.2548362135671911, + "grad_norm": 0.6086869835853577, + "kl": 0.05462646484375, + "learning_rate": 1e-06, + "loss": 0.011, + "num_tokens": 27705724.0, + "reward": 1.15625, + "reward_std": 0.07194302976131439, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15625, + "rewards/curriculum_aware_reward_fn/std": 0.174774631857872, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1083.0, + "completions/max_terminated_length": 1083.0, + "completions/mean_length": 431.4464416503906, + "completions/mean_terminated_length": 431.4464416503906, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.2558679391281919, + "grad_norm": 0.7313266396522522, + "kl": 0.05499267578125, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 27827280.0, + "reward": 1.1468751430511475, + "reward_std": 0.11427027732133865, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672, + "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 390.96429443359375, + "completions/mean_terminated_length": 390.96429443359375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.2568996646891927, + "grad_norm": 0.6673445701599121, + "kl": 0.05084228515625, + "learning_rate": 1e-06, + "loss": -0.0127, + "num_tokens": 27931933.0, + "reward": 1.0687501430511475, + "reward_std": 0.07197443395853043, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.0, + "completions/max_terminated_length": 742.0, + "completions/mean_length": 406.2410888671875, + "completions/mean_terminated_length": 406.2410888671875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.25793139025019346, + "grad_norm": 0.6612470746040344, + "kl": 0.045166015625, + "learning_rate": 1e-06, + "loss": 0.0032, + "num_tokens": 28044429.0, + "reward": 1.1187500953674316, + "reward_std": 0.08468588441610336, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 436.52679443359375, + "completions/mean_terminated_length": 436.52679443359375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.25896311581119424, + "grad_norm": 0.5871869921684265, + "kl": 0.0545654296875, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 28163537.0, + "reward": 1.1187500953674316, + "reward_std": 0.07194302976131439, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 440.107177734375, + "completions/mean_terminated_length": 440.107177734375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.259994841372195, + "grad_norm": 0.5709466338157654, + "kl": 0.0615234375, + "learning_rate": 1e-06, + "loss": 0.0264, + "num_tokens": 28277264.0, + "reward": 1.1250001192092896, + "reward_std": 0.05303254351019859, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1249999925494194, + "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1209.0, + "completions/max_terminated_length": 1209.0, + "completions/mean_length": 451.5625305175781, + "completions/mean_terminated_length": 451.5625305175781, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.26102656693319576, + "grad_norm": 0.4719817638397217, + "kl": 0.05352783203125, + "learning_rate": 1e-06, + "loss": 0.0145, + "num_tokens": 28392778.0, + "reward": 1.0968750715255737, + "reward_std": 0.057444632053375244, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09687499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.15729717910289764, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1399.0, + "completions/max_terminated_length": 1399.0, + "completions/mean_length": 460.4732360839844, + "completions/mean_terminated_length": 460.4732360839844, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.26205829249419654, + "grad_norm": 0.6896635293960571, + "kl": 0.0528564453125, + "learning_rate": 1e-06, + "loss": 0.0125, + "num_tokens": 28513105.0, + "reward": 1.1156251430511475, + "reward_std": 0.1118643507361412, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 929.0, + "completions/max_terminated_length": 929.0, + "completions/mean_length": 406.7589416503906, + "completions/mean_terminated_length": 406.7589416503906, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.2630900180551973, + "grad_norm": 0.684826135635376, + "kl": 0.0616455078125, + "learning_rate": 1e-06, + "loss": 0.0134, + "num_tokens": 28623223.0, + "reward": 1.0875000953674316, + "reward_std": 0.09332224726676941, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08749999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1188.0, + "completions/max_terminated_length": 1188.0, + "completions/mean_length": 529.5982666015625, + "completions/mean_terminated_length": 529.5982666015625, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.2641217436161981, + "grad_norm": 0.4367036521434784, + "kl": 0.047607421875, + "learning_rate": 1e-06, + "loss": 0.0358, + "num_tokens": 28764165.0, + "reward": 1.0906250476837158, + "reward_std": 0.06228790059685707, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 989.0, + "completions/max_terminated_length": 989.0, + "completions/mean_length": 399.14288330078125, + "completions/mean_terminated_length": 399.14288330078125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.26515346917719884, + "grad_norm": 0.704407274723053, + "kl": 0.062744140625, + "learning_rate": 1e-06, + "loss": 0.0264, + "num_tokens": 28881347.0, + "reward": 1.140625, + "reward_std": 0.0981341153383255, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.140625, + "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 508.76788330078125, + "completions/mean_terminated_length": 508.76788330078125, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.2661851947381996, + "grad_norm": 0.4720875918865204, + "kl": 0.05352783203125, + "learning_rate": 1e-06, + "loss": 0.0164, + "num_tokens": 29010702.0, + "reward": 1.0343750715255737, + "reward_std": 0.057844411581754684, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.03437500074505806, + "rewards/curriculum_aware_reward_fn/std": 0.10462959855794907, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 404.6339416503906, + "completions/mean_terminated_length": 404.6339416503906, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.2672169202992004, + "grad_norm": 0.732636570930481, + "kl": 0.064453125, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 29114968.0, + "reward": 1.1625001430511475, + "reward_std": 0.09634716808795929, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16250000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 854.0, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 388.58929443359375, + "completions/mean_terminated_length": 388.58929443359375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.2682486458602012, + "grad_norm": 0.6256258487701416, + "kl": 0.08599853515625, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 29224073.0, + "reward": 1.1625001430511475, + "reward_std": 0.0612691268324852, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16249999403953552, + "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1002.0, + "completions/max_terminated_length": 1002.0, + "completions/mean_length": 407.8482360839844, + "completions/mean_terminated_length": 407.8482360839844, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.269280371421202, + "grad_norm": 0.6058484315872192, + "kl": 0.0609130859375, + "learning_rate": 1e-06, + "loss": 0.0155, + "num_tokens": 29335538.0, + "reward": 1.1312501430511475, + "reward_std": 0.07055586576461792, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13124999403953552, + "rewards/curriculum_aware_reward_fn/std": 0.17020456492900848, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 782.0, + "completions/max_terminated_length": 782.0, + "completions/mean_length": 411.2410888671875, + "completions/mean_terminated_length": 411.2410888671875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.2703120969822027, + "grad_norm": 0.38458946347236633, + "kl": 0.0601806640625, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 29448146.0, + "reward": 1.0973215103149414, + "reward_std": 0.045989371836185455, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1122.0, + "completions/max_terminated_length": 1122.0, + "completions/mean_length": 434.9821472167969, + "completions/mean_terminated_length": 434.9821472167969, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.2713438225432035, + "grad_norm": 0.6179335117340088, + "kl": 0.05865478515625, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 29564827.0, + "reward": 1.1187500953674316, + "reward_std": 0.07536774128675461, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 397.08929443359375, + "completions/mean_terminated_length": 397.08929443359375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.2723755481042043, + "grad_norm": 0.7425304651260376, + "kl": 0.06341552734375, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 29679042.0, + "reward": 1.1281250715255737, + "reward_std": 0.07678630203008652, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 407.02679443359375, + "completions/mean_terminated_length": 407.02679443359375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.27340727366520506, + "grad_norm": 0.6302496790885925, + "kl": 0.0601806640625, + "learning_rate": 1e-06, + "loss": -0.0056, + "num_tokens": 29796414.0, + "reward": 1.15625, + "reward_std": 0.06953710317611694, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15625, + "rewards/curriculum_aware_reward_fn/std": 0.21446822583675385, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 374.89288330078125, + "completions/mean_terminated_length": 374.89288330078125, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.27443899922620585, + "grad_norm": 0.5936657786369324, + "kl": 0.069091796875, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 29897335.0, + "reward": 1.1187500953674316, + "reward_std": 0.06370645761489868, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16645818948745728, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 466.9732360839844, + "completions/mean_terminated_length": 434.279296875, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.2754707247872066, + "grad_norm": 0.7581308484077454, + "kl": 0.05352783203125, + "learning_rate": 1e-06, + "loss": 0.0481, + "num_tokens": 30021750.0, + "reward": 1.1348215341567993, + "reward_std": 0.13005873560905457, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.17296108603477478, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 355.8482360839844, + "completions/mean_terminated_length": 355.8482360839844, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.27650245034820736, + "grad_norm": 0.685746967792511, + "kl": 0.06317138671875, + "learning_rate": 1e-06, + "loss": 0.0234, + "num_tokens": 30115862.0, + "reward": 1.15625, + "reward_std": 0.0733615905046463, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15625, + "rewards/curriculum_aware_reward_fn/std": 0.174774631857872, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1830.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 414.9285888671875, + "completions/mean_terminated_length": 414.9285888671875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.27753417590920815, + "grad_norm": 0.7315962910652161, + "kl": 0.0662841796875, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 30218670.0, + "reward": 1.09375, + "reward_std": 0.1036277636885643, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1188.0, + "completions/max_terminated_length": 1188.0, + "completions/mean_length": 490.294677734375, + "completions/mean_terminated_length": 490.294677734375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.27856590147020893, + "grad_norm": 0.47885870933532715, + "kl": 0.048583984375, + "learning_rate": 1e-06, + "loss": -0.0103, + "num_tokens": 30342190.0, + "reward": 1.109375238418579, + "reward_std": 0.06330667436122894, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.109375, + "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 885.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 363.15179443359375, + "completions/mean_terminated_length": 363.15179443359375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.2795976270312097, + "grad_norm": 0.4935992360115051, + "kl": 0.05731201171875, + "learning_rate": 1e-06, + "loss": -0.0129, + "num_tokens": 30449279.0, + "reward": 1.2312501668930054, + "reward_std": 0.06228789687156677, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.23125000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1352.0, + "completions/max_terminated_length": 1352.0, + "completions/mean_length": 445.70538330078125, + "completions/mean_terminated_length": 445.70538330078125, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.28062935259221045, + "grad_norm": 0.6727720499038696, + "kl": 0.0589599609375, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 30572117.0, + "reward": 1.1187500953674316, + "reward_std": 0.07678630203008652, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 940.0, + "completions/max_terminated_length": 940.0, + "completions/mean_length": 416.15179443359375, + "completions/mean_terminated_length": 416.15179443359375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.28166107815321123, + "grad_norm": 0.5469335317611694, + "kl": 0.05621337890625, + "learning_rate": 1e-06, + "loss": -0.0053, + "num_tokens": 30683511.0, + "reward": 1.1062500476837158, + "reward_std": 0.05401992052793503, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 409.9910888671875, + "completions/mean_terminated_length": 409.9910888671875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.282692803714212, + "grad_norm": 0.5514264702796936, + "kl": 0.05218505859375, + "learning_rate": 1e-06, + "loss": 0.0133, + "num_tokens": 30792969.0, + "reward": 1.0906251668930054, + "reward_std": 0.062287889420986176, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1041.0, + "completions/max_terminated_length": 1041.0, + "completions/mean_length": 412.8839416503906, + "completions/mean_terminated_length": 412.8839416503906, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.2837245292752128, + "grad_norm": 0.8001331090927124, + "kl": 0.063720703125, + "learning_rate": 1e-06, + "loss": 0.0049, + "num_tokens": 30902044.0, + "reward": 1.1531251668930054, + "reward_std": 0.08709181845188141, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15312500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17440778017044067, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1818.0, + "completions/max_terminated_length": 1818.0, + "completions/mean_length": 506.9107360839844, + "completions/mean_terminated_length": 506.9107360839844, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.2847562548362136, + "grad_norm": 0.4141788184642792, + "kl": 0.053466796875, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 31025654.0, + "reward": 1.0781251192092896, + "reward_std": 0.053001150488853455, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.078125, + "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1385.0, + "completions/max_terminated_length": 1385.0, + "completions/mean_length": 471.39288330078125, + "completions/mean_terminated_length": 471.39288330078125, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.2857879803972143, + "grad_norm": 0.43438559770584106, + "kl": 0.0543212890625, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 31145678.0, + "reward": 1.1031250953674316, + "reward_std": 0.05059521645307541, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 939.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 490.40179443359375, + "completions/mean_terminated_length": 490.40179443359375, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.2868197059582151, + "grad_norm": 0.7618850469589233, + "kl": 0.04229736328125, + "learning_rate": 1e-06, + "loss": 0.01, + "num_tokens": 31270218.0, + "reward": 1.1343750953674316, + "reward_std": 0.12738150358200073, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1039.0, + "completions/max_terminated_length": 1039.0, + "completions/mean_length": 447.732177734375, + "completions/mean_terminated_length": 447.732177734375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.2878514315192159, + "grad_norm": 0.6121480464935303, + "kl": 0.051025390625, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 31381159.0, + "reward": 1.0812500715255737, + "reward_std": 0.07539913803339005, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08125000447034836, + "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 904.0, + "completions/max_terminated_length": 904.0, + "completions/mean_length": 415.1160888671875, + "completions/mean_terminated_length": 415.1160888671875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.28888315708021667, + "grad_norm": 0.7515974044799805, + "kl": 0.0531005859375, + "learning_rate": 1e-06, + "loss": 0.0139, + "num_tokens": 31485633.0, + "reward": 1.146875023841858, + "reward_std": 0.10880802571773529, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14687499403953552, + "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1068.0, + "completions/max_terminated_length": 1068.0, + "completions/mean_length": 448.65179443359375, + "completions/mean_terminated_length": 448.65179443359375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.28991488264121745, + "grad_norm": 0.7135260105133057, + "kl": 0.05035400390625, + "learning_rate": 1e-06, + "loss": 0.008, + "num_tokens": 31609536.0, + "reward": 1.1468751430511475, + "reward_std": 0.087091825902462, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672, + "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 971.0, + "completions/max_terminated_length": 971.0, + "completions/mean_length": 443.6785888671875, + "completions/mean_terminated_length": 443.6785888671875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.29094660820221824, + "grad_norm": 0.6849467754364014, + "kl": 0.05706787109375, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 31723754.0, + "reward": 1.146875023841858, + "reward_std": 0.10498353838920593, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672, + "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 428.2500305175781, + "completions/mean_terminated_length": 428.2500305175781, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.29197833376321897, + "grad_norm": 0.7078130841255188, + "kl": 0.05023193359375, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 31832380.0, + "reward": 1.1343750953674316, + "reward_std": 0.10359636694192886, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437500596046448, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1161.0, + "completions/max_terminated_length": 1161.0, + "completions/mean_length": 532.8660888671875, + "completions/mean_terminated_length": 532.8660888671875, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.29301005932421975, + "grad_norm": 0.5811125636100769, + "kl": 0.04791259765625, + "learning_rate": 1e-06, + "loss": -0.0246, + "num_tokens": 31966891.0, + "reward": 1.0906251668930054, + "reward_std": 0.08224855363368988, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1306.0, + "completions/max_terminated_length": 1306.0, + "completions/mean_length": 566.25, + "completions/mean_terminated_length": 566.25, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.29404178488522054, + "grad_norm": 0.36866819858551025, + "kl": 0.04547119140625, + "learning_rate": 1e-06, + "loss": -0.0162, + "num_tokens": 32109145.0, + "reward": 1.0593751668930054, + "reward_std": 0.04130847379565239, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05937499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1880.0, + "completions/max_terminated_length": 1880.0, + "completions/mean_length": 555.8125, + "completions/mean_terminated_length": 555.8125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.2950735104462213, + "grad_norm": 0.3248611092567444, + "kl": 0.0404052734375, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 32243839.0, + "reward": 1.1125000715255737, + "reward_std": 0.024803919717669487, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11250000447034836, + "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1022.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 536.2857666015625, + "completions/mean_terminated_length": 536.2857666015625, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.2961052360072221, + "grad_norm": 0.49759235978126526, + "kl": 0.04815673828125, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 32372496.0, + "reward": 1.0812501907348633, + "reward_std": 0.06469383835792542, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08124999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1278.0, + "completions/max_terminated_length": 1278.0, + "completions/mean_length": 472.83038330078125, + "completions/mean_terminated_length": 472.83038330078125, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.29713696156822283, + "grad_norm": 0.5727349519729614, + "kl": 0.0496826171875, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 32486038.0, + "reward": 1.1598215103149414, + "reward_std": 0.11348892003297806, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.16875000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17567436397075653, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1318.0, + "completions/max_terminated_length": 1318.0, + "completions/mean_length": 533.2232666015625, + "completions/mean_terminated_length": 533.2232666015625, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.2981686871292236, + "grad_norm": 0.5070824027061462, + "kl": 0.04302978515625, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 32608311.0, + "reward": 1.1187500953674316, + "reward_std": 0.07197443395853043, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.20774692296981812, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1372.0, + "completions/max_terminated_length": 1372.0, + "completions/mean_length": 558.1785888671875, + "completions/mean_terminated_length": 558.1785888671875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.2992004126902244, + "grad_norm": 0.5842003226280212, + "kl": 0.04473876953125, + "learning_rate": 1e-06, + "loss": -0.0222, + "num_tokens": 32738958.0, + "reward": 1.1062500476837158, + "reward_std": 0.09430962055921555, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1037.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 497.6607360839844, + "completions/mean_terminated_length": 497.6607360839844, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.3002321382512252, + "grad_norm": 0.46451354026794434, + "kl": 0.048583984375, + "learning_rate": 1e-06, + "loss": -0.0017, + "num_tokens": 32860475.0, + "reward": 1.1812500953674316, + "reward_std": 0.07882384210824966, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.18125000596046448, + "rewards/curriculum_aware_reward_fn/std": 0.17567436397075653, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1110.0, + "completions/max_terminated_length": 1110.0, + "completions/mean_length": 529.5803833007812, + "completions/mean_terminated_length": 529.5803833007812, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.301263863812226, + "grad_norm": 0.5378194451332092, + "kl": 0.0421142578125, + "learning_rate": 1e-06, + "loss": 0.0276, + "num_tokens": 32988577.0, + "reward": 1.1218751668930054, + "reward_std": 0.0919036790728569, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1236.0, + "completions/max_terminated_length": 1236.0, + "completions/mean_length": 552.7678833007812, + "completions/mean_terminated_length": 552.7678833007812, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.3022955893732267, + "grad_norm": 0.562874436378479, + "kl": 0.0477294921875, + "learning_rate": 1e-06, + "loss": -0.0104, + "num_tokens": 33121754.0, + "reward": 1.1187500953674316, + "reward_std": 0.0919036939740181, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1673.0, + "completions/max_terminated_length": 1673.0, + "completions/mean_length": 558.5982666015625, + "completions/mean_terminated_length": 558.5982666015625, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.3033273149342275, + "grad_norm": 0.5422093868255615, + "kl": 0.043212890625, + "learning_rate": 1e-06, + "loss": 0.0103, + "num_tokens": 33258496.0, + "reward": 1.1406251192092896, + "reward_std": 0.09532838314771652, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.140625, + "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1394.0, + "completions/max_terminated_length": 1394.0, + "completions/mean_length": 595.0535888671875, + "completions/mean_terminated_length": 595.0535888671875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.3043590404952283, + "grad_norm": 0.3412728011608124, + "kl": 0.0443115234375, + "learning_rate": 1e-06, + "loss": 0.0208, + "num_tokens": 33401547.0, + "reward": 1.0281250476837158, + "reward_std": 0.04371440038084984, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.02812499925494194, + "rewards/curriculum_aware_reward_fn/std": 0.09557347744703293, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1922.0, + "completions/max_terminated_length": 1922.0, + "completions/mean_length": 589.7142944335938, + "completions/mean_terminated_length": 589.7142944335938, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.30539076605622906, + "grad_norm": 0.46539467573165894, + "kl": 0.038818359375, + "learning_rate": 1e-06, + "loss": 0.0147, + "num_tokens": 33538547.0, + "reward": 1.1187500953674316, + "reward_std": 0.06611239165067673, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.16645818948745728, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1245.0, + "completions/max_terminated_length": 1245.0, + "completions/mean_length": 515.857177734375, + "completions/mean_terminated_length": 515.857177734375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.30642249161722984, + "grad_norm": 0.8021560907363892, + "kl": 0.06085205078125, + "learning_rate": 1e-06, + "loss": 0.0044, + "num_tokens": 33662025.0, + "reward": 1.122321605682373, + "reward_std": 0.11312052607536316, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.13124999403953552, + "rewards/curriculum_aware_reward_fn/std": 0.17020456492900848, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1223.0, + "completions/max_terminated_length": 1223.0, + "completions/mean_length": 570.482177734375, + "completions/mean_terminated_length": 570.482177734375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.30745421717823057, + "grad_norm": 0.542922854423523, + "kl": 0.049560546875, + "learning_rate": 1e-06, + "loss": -0.0052, + "num_tokens": 33788625.0, + "reward": 1.115625023841858, + "reward_std": 0.10017166286706924, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562498658895493, + "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1130.0, + "completions/max_terminated_length": 1130.0, + "completions/mean_length": 528.2589721679688, + "completions/mean_terminated_length": 528.2589721679688, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.30848594273923136, + "grad_norm": 0.6580304503440857, + "kl": 0.07220458984375, + "learning_rate": 1e-06, + "loss": 0.0168, + "num_tokens": 33920737.0, + "reward": 1.1500000953674316, + "reward_std": 0.08465448766946793, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2114.0, + "completions/max_terminated_length": 2114.0, + "completions/mean_length": 544.6517944335938, + "completions/mean_terminated_length": 544.6517944335938, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.30951766830023214, + "grad_norm": 0.6650287508964539, + "kl": 0.051025390625, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 34046726.0, + "reward": 1.109375238418579, + "reward_std": 0.103996142745018, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.109375, + "rewards/curriculum_aware_reward_fn/std": 0.19388526678085327, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1616.0, + "completions/max_terminated_length": 1616.0, + "completions/mean_length": 582.7232666015625, + "completions/mean_terminated_length": 582.7232666015625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.3105493938612329, + "grad_norm": 0.527751624584198, + "kl": 0.042236328125, + "learning_rate": 1e-06, + "loss": 0.013, + "num_tokens": 34189131.0, + "reward": 1.0687501430511475, + "reward_std": 0.06268768012523651, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1237.0, + "completions/max_terminated_length": 1237.0, + "completions/mean_length": 517.8660888671875, + "completions/mean_terminated_length": 517.8660888671875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.3115811194222337, + "grad_norm": 0.6737282872200012, + "kl": 0.05023193359375, + "learning_rate": 1e-06, + "loss": 0.0101, + "num_tokens": 34321521.0, + "reward": 1.0941965579986572, + "reward_std": 0.10627111792564392, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 956.0, + "completions/max_terminated_length": 956.0, + "completions/mean_length": 510.2857360839844, + "completions/mean_terminated_length": 510.2857360839844, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.31261284498323444, + "grad_norm": 0.5042452216148376, + "kl": 0.05157470703125, + "learning_rate": 1e-06, + "loss": 0.0215, + "num_tokens": 34450024.0, + "reward": 1.1281250715255737, + "reward_std": 0.07055586576461792, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1108.0, + "completions/max_terminated_length": 1108.0, + "completions/mean_length": 447.5625305175781, + "completions/mean_terminated_length": 447.5625305175781, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.3136445705442352, + "grad_norm": 0.6584708094596863, + "kl": 0.04644775390625, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 34562248.0, + "reward": 1.15625, + "reward_std": 0.08363571017980576, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15625, + "rewards/curriculum_aware_reward_fn/std": 0.174774631857872, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1137.0, + "completions/max_terminated_length": 1137.0, + "completions/mean_length": 526.6964721679688, + "completions/mean_terminated_length": 526.6964721679688, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.314676296105236, + "grad_norm": 0.5943373441696167, + "kl": 0.0426025390625, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 34691525.0, + "reward": 1.125, + "reward_std": 0.10017166286706924, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1015.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 535.8660888671875, + "completions/mean_terminated_length": 535.8660888671875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.3157080216662368, + "grad_norm": 0.6603030562400818, + "kl": 0.04278564453125, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 34825035.0, + "reward": 1.1625001430511475, + "reward_std": 0.09674695134162903, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16249999403953552, + "rewards/curriculum_aware_reward_fn/std": 0.21492718160152435, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 453.96429443359375, + "completions/mean_terminated_length": 453.96429443359375, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.3167397472272376, + "grad_norm": 0.42629313468933105, + "kl": 0.04534912109375, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 34949120.0, + "reward": 1.109375, + "reward_std": 0.048189278692007065, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.109375, + "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 911.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 425.7232360839844, + "completions/mean_terminated_length": 425.7232360839844, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.3177714727882383, + "grad_norm": 0.6072869896888733, + "kl": 0.04669189453125, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 35059094.0, + "reward": 1.1875, + "reward_std": 0.08261694759130478, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1875, + "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1143.0, + "completions/max_terminated_length": 1143.0, + "completions/mean_length": 514.5803833007812, + "completions/mean_terminated_length": 514.5803833007812, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.3188031983492391, + "grad_norm": 0.5873180627822876, + "kl": 0.0450439453125, + "learning_rate": 1e-06, + "loss": 0.017, + "num_tokens": 35190052.0, + "reward": 1.0906251668930054, + "reward_std": 0.10359636694192886, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 948.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 467.1964416503906, + "completions/mean_terminated_length": 467.1964416503906, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.3198349239102399, + "grad_norm": 0.57830411195755, + "kl": 0.044921875, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 35307163.0, + "reward": 1.0968750715255737, + "reward_std": 0.0856732651591301, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09687500447034836, + "rewards/curriculum_aware_reward_fn/std": 0.15729716420173645, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 451.2589416503906, + "completions/mean_terminated_length": 451.2589416503906, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.32086664947124066, + "grad_norm": 0.5586079955101013, + "kl": 0.0455322265625, + "learning_rate": 1e-06, + "loss": 0.0125, + "num_tokens": 35425695.0, + "reward": 1.0875000953674316, + "reward_std": 0.07678630203008652, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08749999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1204.0, + "completions/max_terminated_length": 1204.0, + "completions/mean_length": 437.6875305175781, + "completions/mean_terminated_length": 437.6875305175781, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.32189837503224145, + "grad_norm": 0.4353128969669342, + "kl": 0.0452880859375, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 35544163.0, + "reward": 1.1593750715255737, + "reward_std": 0.043714407831430435, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15937499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.1750844269990921, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1153.0, + "completions/max_terminated_length": 1153.0, + "completions/mean_length": 487.2500305175781, + "completions/mean_terminated_length": 487.2500305175781, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.3229301005932422, + "grad_norm": 0.6254375576972961, + "kl": 0.0452880859375, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 35669680.0, + "reward": 1.0875000953674316, + "reward_std": 0.08949775248765945, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.15223558247089386, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1118.0, + "completions/max_terminated_length": 1118.0, + "completions/mean_length": 426.1607360839844, + "completions/mean_terminated_length": 426.1607360839844, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.32396182615424296, + "grad_norm": 0.690932035446167, + "kl": 0.046630859375, + "learning_rate": 1e-06, + "loss": -0.0227, + "num_tokens": 35782979.0, + "reward": 1.1531250476837158, + "reward_std": 0.10054004937410355, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15312500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17440778017044067, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1101.0, + "completions/max_terminated_length": 1101.0, + "completions/mean_length": 464.46429443359375, + "completions/mean_terminated_length": 464.46429443359375, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.32499355171524374, + "grad_norm": 0.603801429271698, + "kl": 0.04901123046875, + "learning_rate": 1e-06, + "loss": 0.019, + "num_tokens": 35903214.0, + "reward": 1.1156251430511475, + "reward_std": 0.07678630203008652, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562498658895493, + "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1027.0, + "completions/max_terminated_length": 1027.0, + "completions/mean_length": 461.4464416503906, + "completions/mean_terminated_length": 461.4464416503906, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.32602527727624453, + "grad_norm": 0.5353335738182068, + "kl": 0.05169677734375, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 36021452.0, + "reward": 1.071874976158142, + "reward_std": 0.053001150488853455, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.07187499850988388, + "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 448.4910888671875, + "completions/mean_terminated_length": 448.4910888671875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.3270570028372453, + "grad_norm": 0.6988222002983093, + "kl": 0.0484619140625, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 36139422.0, + "reward": 1.0812500715255737, + "reward_std": 0.0733615905046463, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08125000447034836, + "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 952.0, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 448.58929443359375, + "completions/mean_terminated_length": 448.58929443359375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.32808872839824604, + "grad_norm": 0.5886378288269043, + "kl": 0.0513916015625, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 36254652.0, + "reward": 1.1218751668930054, + "reward_std": 0.07740528881549835, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 404.02679443359375, + "completions/mean_terminated_length": 404.02679443359375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.32912045395924683, + "grad_norm": 0.7342789769172668, + "kl": 0.0582275390625, + "learning_rate": 1e-06, + "loss": 0.0117, + "num_tokens": 36366795.0, + "reward": 1.1437500715255737, + "reward_std": 0.08607304841279984, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14374999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.17296110093593597, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 457.26788330078125, + "completions/mean_terminated_length": 457.26788330078125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.3301521795202476, + "grad_norm": 0.6931236982345581, + "kl": 0.04937744140625, + "learning_rate": 1e-06, + "loss": -0.0035, + "num_tokens": 36480680.0, + "reward": 1.1281250715255737, + "reward_std": 0.11288312077522278, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 397.5089416503906, + "completions/mean_terminated_length": 397.5089416503906, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.3311839050812484, + "grad_norm": 0.5426543354988098, + "kl": 0.04840087890625, + "learning_rate": 1e-06, + "loss": 0.0157, + "num_tokens": 36585583.0, + "reward": 1.1875, + "reward_std": 0.05886319279670715, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1875, + "rewards/curriculum_aware_reward_fn/std": 0.18152259290218353, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 867.0, + "completions/max_terminated_length": 867.0, + "completions/mean_length": 462.1785888671875, + "completions/mean_terminated_length": 462.1785888671875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.3322156306422492, + "grad_norm": 0.5812787413597107, + "kl": 0.05181884765625, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 36709916.0, + "reward": 1.0660713911056519, + "reward_std": 0.08255477249622345, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.07499999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.144259512424469, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 362.1339416503906, + "completions/mean_terminated_length": 362.1339416503906, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.3332473562032499, + "grad_norm": 0.5366288423538208, + "kl": 0.05487060546875, + "learning_rate": 1e-06, + "loss": -0.0091, + "num_tokens": 36814805.0, + "reward": 1.2156251668930054, + "reward_std": 0.07194303721189499, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21562500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.2006780505180359, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1474.0, + "completions/max_terminated_length": 1474.0, + "completions/mean_length": 453.2500305175781, + "completions/mean_terminated_length": 453.2500305175781, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.3342790817642507, + "grad_norm": 0.4391082227230072, + "kl": 0.04986572265625, + "learning_rate": 1e-06, + "loss": 0.0101, + "num_tokens": 36939692.0, + "reward": 1.0906250476837158, + "reward_std": 0.057444632053375244, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 446.6160888671875, + "completions/mean_terminated_length": 446.6160888671875, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.3353108073252515, + "grad_norm": 0.4519873857498169, + "kl": 0.04400634765625, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 37062268.0, + "reward": 1.0718750953674316, + "reward_std": 0.05161399021744728, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.07187499850988388, + "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 868.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 420.3571472167969, + "completions/mean_terminated_length": 420.3571472167969, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.33634253288625227, + "grad_norm": 0.6282175183296204, + "kl": 0.05517578125, + "learning_rate": 1e-06, + "loss": -0.0215, + "num_tokens": 37172824.0, + "reward": 1.1343750953674316, + "reward_std": 0.08366710692644119, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 426.0446472167969, + "completions/mean_terminated_length": 426.0446472167969, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.33737425844725305, + "grad_norm": 0.7190306186676025, + "kl": 0.048095703125, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 37287132.0, + "reward": 1.1093751192092896, + "reward_std": 0.09674695134162903, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1093749925494194, + "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 781.0, + "completions/max_terminated_length": 781.0, + "completions/mean_length": 426.4285888671875, + "completions/mean_terminated_length": 426.4285888671875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.3384059840082538, + "grad_norm": 0.6656579971313477, + "kl": 0.0509033203125, + "learning_rate": 1e-06, + "loss": 0.0365, + "num_tokens": 37400594.0, + "reward": 1.1156251430511475, + "reward_std": 0.09572818130254745, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562498658895493, + "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1045.0, + "completions/max_terminated_length": 1045.0, + "completions/mean_length": 410.7946472167969, + "completions/mean_terminated_length": 410.7946472167969, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.33943770956925456, + "grad_norm": 0.48774799704551697, + "kl": 0.05328369140625, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 37515910.0, + "reward": 1.09375, + "reward_std": 0.037883758544921875, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569154918193817, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 851.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 409.4107360839844, + "completions/mean_terminated_length": 409.4107360839844, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.34046943513025535, + "grad_norm": 0.3467229902744293, + "kl": 0.04986572265625, + "learning_rate": 1e-06, + "loss": 0.0096, + "num_tokens": 37633836.0, + "reward": 1.1437500715255737, + "reward_std": 0.028228627517819405, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.17296108603477478, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 447.71429443359375, + "completions/mean_terminated_length": 447.71429443359375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.34150116069125613, + "grad_norm": 0.6817765831947327, + "kl": 0.047607421875, + "learning_rate": 1e-06, + "loss": -0.0106, + "num_tokens": 37750193.0, + "reward": 1.1000001430511475, + "reward_std": 0.1166761964559555, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.15882451832294464, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 474.5625305175781, + "completions/mean_terminated_length": 474.5625305175781, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.3425328862522569, + "grad_norm": 0.523029088973999, + "kl": 0.0455322265625, + "learning_rate": 1e-06, + "loss": 0.0182, + "num_tokens": 37887736.0, + "reward": 1.1281250715255737, + "reward_std": 0.07777367532253265, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 891.0, + "completions/max_terminated_length": 891.0, + "completions/mean_length": 434.5714416503906, + "completions/mean_terminated_length": 434.5714416503906, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.34356461181325765, + "grad_norm": 0.7264805436134338, + "kl": 0.047607421875, + "learning_rate": 1e-06, + "loss": -0.0122, + "num_tokens": 38011224.0, + "reward": 1.1406251192092896, + "reward_std": 0.10600230097770691, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.140625, + "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 993.0, + "completions/max_terminated_length": 993.0, + "completions/mean_length": 473.8839416503906, + "completions/mean_terminated_length": 473.8839416503906, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.34459633737425843, + "grad_norm": 0.45366787910461426, + "kl": 0.04718017578125, + "learning_rate": 1e-06, + "loss": 0.0151, + "num_tokens": 38137323.0, + "reward": 1.0250000953674316, + "reward_std": 0.04476457089185715, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.02500000037252903, + "rewards/curriculum_aware_reward_fn/std": 0.09054389595985413, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1048.0, + "completions/max_terminated_length": 1048.0, + "completions/mean_length": 481.1785888671875, + "completions/mean_terminated_length": 481.1785888671875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.3456280629352592, + "grad_norm": 0.304663747549057, + "kl": 0.04620361328125, + "learning_rate": 1e-06, + "loss": 0.006, + "num_tokens": 38266486.0, + "reward": 1.0250000953674316, + "reward_std": 0.024803917855024338, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.02500000037252903, + "rewards/curriculum_aware_reward_fn/std": 0.09054390341043472, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1294.0, + "completions/max_terminated_length": 1294.0, + "completions/mean_length": 418.5535888671875, + "completions/mean_terminated_length": 418.5535888671875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.34665978849626, + "grad_norm": 0.7588237524032593, + "kl": 0.05780029296875, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 38373186.0, + "reward": 1.1531251668930054, + "reward_std": 0.11087695509195328, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15312500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17440778017044067, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 435.4196472167969, + "completions/mean_terminated_length": 435.4196472167969, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.3476915140572608, + "grad_norm": 0.7718022465705872, + "kl": 0.052001953125, + "learning_rate": 1e-06, + "loss": 0.0044, + "num_tokens": 38499065.0, + "reward": 1.1156251430511475, + "reward_std": 0.1225382462143898, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562498658895493, + "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 805.0, + "completions/max_terminated_length": 805.0, + "completions/mean_length": 429.45538330078125, + "completions/mean_terminated_length": 429.45538330078125, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.3487232396182615, + "grad_norm": 0.5799484252929688, + "kl": 0.04791259765625, + "learning_rate": 1e-06, + "loss": -0.0081, + "num_tokens": 38621035.0, + "reward": 1.1031250953674316, + "reward_std": 0.0612691193819046, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 433.4821472167969, + "completions/mean_terminated_length": 433.4821472167969, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.3497549651792623, + "grad_norm": 0.7384994626045227, + "kl": 0.0521240234375, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 38731262.0, + "reward": 1.0718750953674316, + "reward_std": 0.08986613154411316, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.07187499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1145.0, + "completions/max_terminated_length": 1145.0, + "completions/mean_length": 432.7500305175781, + "completions/mean_terminated_length": 432.7500305175781, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.3507866907402631, + "grad_norm": 0.6898339986801147, + "kl": 0.04461669921875, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 38847798.0, + "reward": 1.0875000953674316, + "reward_std": 0.0898975357413292, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.15223558247089386, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.0, + "completions/max_terminated_length": 792.0, + "completions/mean_length": 418.8214416503906, + "completions/mean_terminated_length": 418.8214416503906, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.35181841630126387, + "grad_norm": 0.5710652470588684, + "kl": 0.05328369140625, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 38960093.0, + "reward": 1.0843751430511475, + "reward_std": 0.08021100610494614, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08437500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 442.3482360839844, + "completions/mean_terminated_length": 442.3482360839844, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.35285014186226465, + "grad_norm": 0.6075477004051208, + "kl": 0.047119140625, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 39076675.0, + "reward": 1.068750023841858, + "reward_std": 0.08363571763038635, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 399.6339416503906, + "completions/mean_terminated_length": 399.6339416503906, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.3538818674232654, + "grad_norm": 0.5659580230712891, + "kl": 0.057861328125, + "learning_rate": 1e-06, + "loss": 0.0029, + "num_tokens": 39185750.0, + "reward": 1.1312501430511475, + "reward_std": 0.05988196283578873, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13124999403953552, + "rewards/curriculum_aware_reward_fn/std": 0.17020456492900848, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 366.3214416503906, + "completions/mean_terminated_length": 366.3214416503906, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.35491359298426617, + "grad_norm": 0.6755393147468567, + "kl": 0.05224609375, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 39287580.0, + "reward": 1.1875, + "reward_std": 0.08122977614402771, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1875, + "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 422.0446472167969, + "completions/mean_terminated_length": 422.0446472167969, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.35594531854526695, + "grad_norm": 0.6563684940338135, + "kl": 0.04205322265625, + "learning_rate": 1e-06, + "loss": 0.0067, + "num_tokens": 39414908.0, + "reward": 1.084375023841858, + "reward_std": 0.08159816265106201, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08437498658895493, + "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 916.0, + "completions/max_terminated_length": 916.0, + "completions/mean_length": 420.4732360839844, + "completions/mean_terminated_length": 420.4732360839844, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.35697704410626774, + "grad_norm": 0.6138877272605896, + "kl": 0.05419921875, + "learning_rate": 1e-06, + "loss": 0.0193, + "num_tokens": 39524665.0, + "reward": 1.1379464864730835, + "reward_std": 0.09797175228595734, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672, + "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 778.0, + "completions/max_terminated_length": 778.0, + "completions/mean_length": 401.0357360839844, + "completions/mean_terminated_length": 401.0357360839844, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.3580087696672685, + "grad_norm": 0.7666031718254089, + "kl": 0.065185546875, + "learning_rate": 1e-06, + "loss": -0.0026, + "num_tokens": 39637208.0, + "reward": 1.1468751430511475, + "reward_std": 0.086041659116745, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672, + "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1541.0, + "completions/max_terminated_length": 1541.0, + "completions/mean_length": 466.33929443359375, + "completions/mean_terminated_length": 466.33929443359375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.3590404952282693, + "grad_norm": 0.5586252808570862, + "kl": 0.0443115234375, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 39763442.0, + "reward": 1.140625238418579, + "reward_std": 0.07536774128675461, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1406249850988388, + "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 769.0, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 407.15179443359375, + "completions/mean_terminated_length": 407.15179443359375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.36007222078927004, + "grad_norm": 0.719732940196991, + "kl": 0.0467529296875, + "learning_rate": 1e-06, + "loss": -0.0105, + "num_tokens": 39870685.0, + "reward": 1.1656252145767212, + "reward_std": 0.10359636694192886, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16562499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17553408443927765, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 845.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 421.9464416503906, + "completions/mean_terminated_length": 421.9464416503906, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.3611039463502708, + "grad_norm": 0.598708987236023, + "kl": 0.0479736328125, + "learning_rate": 1e-06, + "loss": 0.0075, + "num_tokens": 39988958.0, + "reward": 1.1156251430511475, + "reward_std": 0.08949775248765945, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1653594672679901, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1080.0, + "completions/max_terminated_length": 1080.0, + "completions/mean_length": 420.8214416503906, + "completions/mean_terminated_length": 420.8214416503906, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.3621356719112716, + "grad_norm": 0.6411553621292114, + "kl": 0.04693603515625, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 40104039.0, + "reward": 1.1375000476837158, + "reward_std": 0.08505427092313766, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17170317471027374, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 929.0, + "completions/max_terminated_length": 929.0, + "completions/mean_length": 398.6339416503906, + "completions/mean_terminated_length": 398.6339416503906, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.3631673974722724, + "grad_norm": 0.6558811068534851, + "kl": 0.0516357421875, + "learning_rate": 1e-06, + "loss": -0.0033, + "num_tokens": 40223796.0, + "reward": 1.125, + "reward_std": 0.08261694014072418, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 433.3035888671875, + "completions/mean_terminated_length": 433.3035888671875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.3641991230332732, + "grad_norm": 0.6025667190551758, + "kl": 0.0465087890625, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 40333849.0, + "reward": 1.0875000953674316, + "reward_std": 0.05645725876092911, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 419.1964416503906, + "completions/mean_terminated_length": 419.1964416503906, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.3652308485942739, + "grad_norm": 0.6397855281829834, + "kl": 0.0496826171875, + "learning_rate": 1e-06, + "loss": 0.0166, + "num_tokens": 40460648.0, + "reward": 1.0968750715255737, + "reward_std": 0.08363571763038635, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09687499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.15729717910289764, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 442.5714416503906, + "completions/mean_terminated_length": 442.5714416503906, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.3662625741552747, + "grad_norm": 0.6532189249992371, + "kl": 0.0506591796875, + "learning_rate": 1e-06, + "loss": -0.0114, + "num_tokens": 40579999.0, + "reward": 1.125000238418579, + "reward_std": 0.08607304841279984, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1249999925494194, + "rewards/curriculum_aware_reward_fn/std": 0.16845881938934326, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 414.6875305175781, + "completions/mean_terminated_length": 414.6875305175781, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.3672942997162755, + "grad_norm": 0.4674402177333832, + "kl": 0.0479736328125, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 40697165.0, + "reward": 1.1000001430511475, + "reward_std": 0.05198238044977188, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09999998658895493, + "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.0, + "completions/max_terminated_length": 734.0, + "completions/mean_length": 442.8214416503906, + "completions/mean_terminated_length": 442.8214416503906, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.36832602527727626, + "grad_norm": 0.6097815036773682, + "kl": 0.04449462890625, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 40815214.0, + "reward": 1.0656250715255737, + "reward_std": 0.07539913058280945, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.06562499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.13722331821918488, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 422.1964416503906, + "completions/mean_terminated_length": 422.1964416503906, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.36935775083827704, + "grad_norm": 0.7170711755752563, + "kl": 0.04144287109375, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 40934663.0, + "reward": 1.1348215341567993, + "reward_std": 0.11864346265792847, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.17296110093593597, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 410.08038330078125, + "completions/mean_terminated_length": 410.08038330078125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.3703894763992778, + "grad_norm": 0.6699801683425903, + "kl": 0.0465087890625, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 41050795.0, + "reward": 1.1531251668930054, + "reward_std": 0.08021100610494614, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15312500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17440778017044067, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 887.0, + "completions/max_terminated_length": 887.0, + "completions/mean_length": 436.4821472167969, + "completions/mean_terminated_length": 436.4821472167969, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.37142120196027856, + "grad_norm": 0.6288172006607056, + "kl": 0.05291748046875, + "learning_rate": 1e-06, + "loss": -0.0039, + "num_tokens": 41168393.0, + "reward": 1.087499976158142, + "reward_std": 0.08224854618310928, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08749999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1025.0, + "completions/max_terminated_length": 1025.0, + "completions/mean_length": 444.3214416503906, + "completions/mean_terminated_length": 444.3214416503906, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.37245292752127934, + "grad_norm": 0.5419376492500305, + "kl": 0.04608154296875, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 41291272.0, + "reward": 1.1281250715255737, + "reward_std": 0.05886319279670715, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 786.0, + "completions/max_terminated_length": 786.0, + "completions/mean_length": 395.21429443359375, + "completions/mean_terminated_length": 395.21429443359375, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.3734846530822801, + "grad_norm": 0.6662964224815369, + "kl": 0.0462646484375, + "learning_rate": 1e-06, + "loss": -0.0034, + "num_tokens": 41401840.0, + "reward": 1.125, + "reward_std": 0.09252267330884933, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 942.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 442.982177734375, + "completions/mean_terminated_length": 442.982177734375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.3745163786432809, + "grad_norm": 0.7266864776611328, + "kl": 0.0445556640625, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 41528853.0, + "reward": 1.0875000953674316, + "reward_std": 0.08505427092313766, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.15223556756973267, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1096.0, + "completions/max_terminated_length": 1096.0, + "completions/mean_length": 421.4821472167969, + "completions/mean_terminated_length": 421.4821472167969, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.37554810420428164, + "grad_norm": 0.6591650247573853, + "kl": 0.05230712890625, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 41637421.0, + "reward": 1.115625023841858, + "reward_std": 0.0860416442155838, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1655.0, + "completions/max_terminated_length": 1655.0, + "completions/mean_length": 468.1607360839844, + "completions/mean_terminated_length": 468.1607360839844, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.3765798297652824, + "grad_norm": 0.6517752408981323, + "kl": 0.0452880859375, + "learning_rate": 1e-06, + "loss": 0.0157, + "num_tokens": 41759204.0, + "reward": 1.09375, + "reward_std": 0.08363571017980576, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.0937499925494194, + "rewards/curriculum_aware_reward_fn/std": 0.15569157898426056, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 388.4910888671875, + "completions/mean_terminated_length": 388.4910888671875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.3776115553262832, + "grad_norm": 0.6884093880653381, + "kl": 0.04962158203125, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 41866282.0, + "reward": 1.1406251192092896, + "reward_std": 0.09918428212404251, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.140625, + "rewards/curriculum_aware_reward_fn/std": 0.20724830031394958, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 806.0, + "completions/max_terminated_length": 806.0, + "completions/mean_length": 403.2857360839844, + "completions/mean_terminated_length": 403.2857360839844, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.378643280887284, + "grad_norm": 0.60127854347229, + "kl": 0.04571533203125, + "learning_rate": 1e-06, + "loss": 0.0085, + "num_tokens": 41973687.0, + "reward": 1.1250001192092896, + "reward_std": 0.0798112154006958, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 410.4910888671875, + "completions/mean_terminated_length": 410.4910888671875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.3796750064482848, + "grad_norm": 0.6465752720832825, + "kl": 0.0518798828125, + "learning_rate": 1e-06, + "loss": 0.0289, + "num_tokens": 42083173.0, + "reward": 1.1343750953674316, + "reward_std": 0.08949775993824005, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.1893601268529892, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1342.0, + "completions/max_terminated_length": 1342.0, + "completions/mean_length": 394.1696472167969, + "completions/mean_terminated_length": 394.1696472167969, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.3807067320092855, + "grad_norm": 0.7463873624801636, + "kl": 0.05291748046875, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 42187520.0, + "reward": 1.1437500715255737, + "reward_std": 0.10257759690284729, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.17296108603477478, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 434.0000305175781, + "completions/mean_terminated_length": 434.0000305175781, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.3817384575702863, + "grad_norm": 0.7016083598136902, + "kl": 0.059326171875, + "learning_rate": 1e-06, + "loss": 0.0265, + "num_tokens": 42313065.0, + "reward": 1.1156251430511475, + "reward_std": 0.10297737270593643, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562498658895493, + "rewards/curriculum_aware_reward_fn/std": 0.17820820212364197, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 412.64288330078125, + "completions/mean_terminated_length": 412.64288330078125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.3827701831312871, + "grad_norm": 0.6370155811309814, + "kl": 0.0504150390625, + "learning_rate": 1e-06, + "loss": -0.0026, + "num_tokens": 42424458.0, + "reward": 1.1281250715255737, + "reward_std": 0.08326732367277145, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1306.0, + "completions/max_terminated_length": 1306.0, + "completions/mean_length": 379.7946472167969, + "completions/mean_terminated_length": 379.7946472167969, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.38380190869228786, + "grad_norm": 0.7614575624465942, + "kl": 0.0594482421875, + "learning_rate": 1e-06, + "loss": 0.0085, + "num_tokens": 42538150.0, + "reward": 1.1812502145767212, + "reward_std": 0.09532840549945831, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.18124999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.21520207822322845, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1313.0, + "completions/max_terminated_length": 1313.0, + "completions/mean_length": 464.3839416503906, + "completions/mean_terminated_length": 464.3839416503906, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.38483363425328865, + "grad_norm": 0.4783412516117096, + "kl": 0.04736328125, + "learning_rate": 1e-06, + "loss": -0.0024, + "num_tokens": 42657277.0, + "reward": 1.1125000715255737, + "reward_std": 0.0660809874534607, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11250000447034836, + "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 404.8035888671875, + "completions/mean_terminated_length": 404.8035888671875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.3858653598142894, + "grad_norm": 0.5575130581855774, + "kl": 0.05108642578125, + "learning_rate": 1e-06, + "loss": 0.0106, + "num_tokens": 42764404.0, + "reward": 1.1468751430511475, + "reward_std": 0.0671311616897583, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672, + "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 421.3660888671875, + "completions/mean_terminated_length": 421.3660888671875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.38689708537529016, + "grad_norm": 0.7298808097839355, + "kl": 0.050048828125, + "learning_rate": 1e-06, + "loss": 0.0233, + "num_tokens": 42874625.0, + "reward": 1.1316965818405151, + "reward_std": 0.10944001376628876, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.1406249850988388, + "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 387.9196472167969, + "completions/mean_terminated_length": 387.9196472167969, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.38792881093629095, + "grad_norm": 0.5968577265739441, + "kl": 0.05230712890625, + "learning_rate": 1e-06, + "loss": 0.0209, + "num_tokens": 42991649.0, + "reward": 1.1531251668930054, + "reward_std": 0.07055586576461792, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15312500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17440778017044067, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 716.0, + "completions/max_terminated_length": 716.0, + "completions/mean_length": 423.77679443359375, + "completions/mean_terminated_length": 423.77679443359375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.38896053649729173, + "grad_norm": 0.5788670778274536, + "kl": 0.04803466796875, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 43110316.0, + "reward": 1.0843751430511475, + "reward_std": 0.07333019375801086, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08437500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 373.6875305175781, + "completions/mean_terminated_length": 373.6875305175781, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.3899922620582925, + "grad_norm": 0.7469679713249207, + "kl": 0.0487060546875, + "learning_rate": 1e-06, + "loss": 0.032, + "num_tokens": 43217570.0, + "reward": 1.171875, + "reward_std": 0.1025775894522667, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.171875, + "rewards/curriculum_aware_reward_fn/std": 0.175758495926857, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 821.0, + "completions/max_terminated_length": 821.0, + "completions/mean_length": 415.3482360839844, + "completions/mean_terminated_length": 415.3482360839844, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.39102398761929325, + "grad_norm": 0.4806562662124634, + "kl": 0.0462646484375, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 43334964.0, + "reward": 1.0968750715255737, + "reward_std": 0.053001150488853455, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09687500447034836, + "rewards/curriculum_aware_reward_fn/std": 0.15729716420173645, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.0, + "completions/max_terminated_length": 678.0, + "completions/mean_length": 386.6607360839844, + "completions/mean_terminated_length": 386.6607360839844, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.39205571318029403, + "grad_norm": 0.6967973113059998, + "kl": 0.04705810546875, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 43446520.0, + "reward": 1.1191965341567993, + "reward_std": 0.09760335832834244, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.1878974735736847, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 383.1964416503906, + "completions/mean_terminated_length": 383.1964416503906, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.3930874387412948, + "grad_norm": 0.5589176416397095, + "kl": 0.05255126953125, + "learning_rate": 1e-06, + "loss": 0.0048, + "num_tokens": 43557282.0, + "reward": 1.1156251430511475, + "reward_std": 0.0612691231071949, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 847.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 348.3482360839844, + "completions/mean_terminated_length": 348.3482360839844, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.3941191643022956, + "grad_norm": 0.6366583108901978, + "kl": 0.049072265625, + "learning_rate": 1e-06, + "loss": 0.0249, + "num_tokens": 43657960.0, + "reward": 1.1437500715255737, + "reward_std": 0.08122977614402771, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.17296108603477478, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 442.58038330078125, + "completions/mean_terminated_length": 442.58038330078125, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.3951508898632964, + "grad_norm": 0.5829880833625793, + "kl": 0.04364013671875, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 43777165.0, + "reward": 1.0437501668930054, + "reward_std": 0.07678630203008652, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.04374999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.11627185344696045, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 836.0, + "completions/max_terminated_length": 836.0, + "completions/mean_length": 418.8750305175781, + "completions/mean_terminated_length": 418.8750305175781, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.3961826154242971, + "grad_norm": 0.5630151033401489, + "kl": 0.05035400390625, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 43896165.0, + "reward": 1.1125000715255737, + "reward_std": 0.07536774128675461, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11250000447034836, + "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1048.0, + "completions/max_terminated_length": 1048.0, + "completions/mean_length": 396.4375305175781, + "completions/mean_terminated_length": 396.4375305175781, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.3972143409852979, + "grad_norm": 0.6741788983345032, + "kl": 0.0452880859375, + "learning_rate": 1e-06, + "loss": -0.0044, + "num_tokens": 44004406.0, + "reward": 1.21875, + "reward_std": 0.07194302976131439, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21875, + "rewards/curriculum_aware_reward_fn/std": 0.21076062321662903, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1025.0, + "completions/max_terminated_length": 1025.0, + "completions/mean_length": 432.4285888671875, + "completions/mean_terminated_length": 432.4285888671875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.3982460665462987, + "grad_norm": 0.6056326031684875, + "kl": 0.04156494140625, + "learning_rate": 1e-06, + "loss": 0.014, + "num_tokens": 44116167.0, + "reward": 1.109375, + "reward_std": 0.08986614644527435, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1093749925494194, + "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 388.2321472167969, + "completions/mean_terminated_length": 388.2321472167969, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.39927779210729947, + "grad_norm": 0.7883094549179077, + "kl": 0.04345703125, + "learning_rate": 1e-06, + "loss": -0.0217, + "num_tokens": 44229866.0, + "reward": 1.1218751668930054, + "reward_std": 0.09332224726676941, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 893.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 469.39288330078125, + "completions/mean_terminated_length": 469.39288330078125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.40030951766830025, + "grad_norm": 0.6451113224029541, + "kl": 0.04473876953125, + "learning_rate": 1e-06, + "loss": 0.0096, + "num_tokens": 44351031.0, + "reward": 1.109375, + "reward_std": 0.10396476089954376, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.109375, + "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 384.70538330078125, + "completions/mean_terminated_length": 384.70538330078125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.401341243229301, + "grad_norm": 0.5290384888648987, + "kl": 0.0445556640625, + "learning_rate": 1e-06, + "loss": -0.005, + "num_tokens": 44461283.0, + "reward": 1.0968750715255737, + "reward_std": 0.05682564526796341, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09687499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.15729717910289764, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 406.6607360839844, + "completions/mean_terminated_length": 406.6607360839844, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.40237296879030177, + "grad_norm": 0.7036343216896057, + "kl": 0.04498291015625, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 44579237.0, + "reward": 1.1312501430511475, + "reward_std": 0.10017165541648865, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13125000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.17020456492900848, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 992.0, + "completions/max_terminated_length": 992.0, + "completions/mean_length": 452.5982360839844, + "completions/mean_terminated_length": 452.5982360839844, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.40340469435130255, + "grad_norm": 0.6781396865844727, + "kl": 0.045166015625, + "learning_rate": 1e-06, + "loss": 0.0024, + "num_tokens": 44694398.0, + "reward": 1.125, + "reward_std": 0.07398058474063873, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.16845881938934326, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 902.0, + "completions/max_terminated_length": 902.0, + "completions/mean_length": 434.8750305175781, + "completions/mean_terminated_length": 434.8750305175781, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.40443641991230334, + "grad_norm": 0.5618754029273987, + "kl": 0.04437255859375, + "learning_rate": 1e-06, + "loss": 0.012, + "num_tokens": 44814735.0, + "reward": 1.084375023841858, + "reward_std": 0.07536774128675461, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08437500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 435.3214416503906, + "completions/mean_terminated_length": 402.34234619140625, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.4054681454733041, + "grad_norm": 0.5044761896133423, + "kl": 0.0457763671875, + "learning_rate": 1e-06, + "loss": 0.0414, + "num_tokens": 44926540.0, + "reward": 1.1254465579986572, + "reward_std": 0.07346688210964203, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 364.8125305175781, + "completions/mean_terminated_length": 364.8125305175781, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.40649987103430485, + "grad_norm": 0.6403542160987854, + "kl": 0.04803466796875, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 45029475.0, + "reward": 1.1000001430511475, + "reward_std": 0.07194302976131439, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 370.83929443359375, + "completions/mean_terminated_length": 370.83929443359375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.40753159659530563, + "grad_norm": 0.6687615513801575, + "kl": 0.054443359375, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 45142542.0, + "reward": 1.1125000715255737, + "reward_std": 0.08122977614402771, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11249999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1145.0, + "completions/max_terminated_length": 1145.0, + "completions/mean_length": 417.45538330078125, + "completions/mean_terminated_length": 417.45538330078125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.4085633221563064, + "grad_norm": 0.5757293701171875, + "kl": 0.049072265625, + "learning_rate": 1e-06, + "loss": 0.0131, + "num_tokens": 45262853.0, + "reward": 1.1593750715255737, + "reward_std": 0.07333019375801086, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15937498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.1750844269990921, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 886.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 379.33038330078125, + "completions/mean_terminated_length": 379.33038330078125, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.4095950477173072, + "grad_norm": 0.6369082927703857, + "kl": 0.04974365234375, + "learning_rate": 1e-06, + "loss": -0.0135, + "num_tokens": 45373413.0, + "reward": 1.1593750715255737, + "reward_std": 0.06851832568645477, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15937498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.1750844269990921, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 394.9375305175781, + "completions/mean_terminated_length": 394.9375305175781, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.410626773278308, + "grad_norm": 0.5108514428138733, + "kl": 0.0435791015625, + "learning_rate": 1e-06, + "loss": 0.008, + "num_tokens": 45486106.0, + "reward": 1.068750023841858, + "reward_std": 0.058463405817747116, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 933.0, + "completions/max_terminated_length": 933.0, + "completions/mean_length": 412.3660888671875, + "completions/mean_terminated_length": 412.3660888671875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.4116584988393087, + "grad_norm": 0.6662240624427795, + "kl": 0.05010986328125, + "learning_rate": 1e-06, + "loss": 0.0124, + "num_tokens": 45593408.0, + "reward": 1.1218751668930054, + "reward_std": 0.08949775993824005, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1019.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 415.9821472167969, + "completions/mean_terminated_length": 415.9821472167969, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.4126902244003095, + "grad_norm": 0.5813707709312439, + "kl": 0.051513671875, + "learning_rate": 1e-06, + "loss": -0.0084, + "num_tokens": 45706981.0, + "reward": 1.1406251192092896, + "reward_std": 0.07194302976131439, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.140625, + "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1422.0, + "completions/max_terminated_length": 1422.0, + "completions/mean_length": 455.357177734375, + "completions/mean_terminated_length": 455.357177734375, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.4137219499613103, + "grad_norm": 0.5764865875244141, + "kl": 0.04718017578125, + "learning_rate": 1e-06, + "loss": -0.004, + "num_tokens": 45838670.0, + "reward": 1.1125001907348633, + "reward_std": 0.06611239165067673, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11249998956918716, + "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 360.45538330078125, + "completions/mean_terminated_length": 360.45538330078125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.41475367552231107, + "grad_norm": 0.6881871819496155, + "kl": 0.052978515625, + "learning_rate": 1e-06, + "loss": 0.0092, + "num_tokens": 45948992.0, + "reward": 1.0968750715255737, + "reward_std": 0.07780507206916809, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09687500447034836, + "rewards/curriculum_aware_reward_fn/std": 0.15729716420173645, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 964.0, + "completions/max_terminated_length": 964.0, + "completions/mean_length": 426.0714416503906, + "completions/mean_terminated_length": 426.0714416503906, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.41578540108331186, + "grad_norm": 0.6441931128501892, + "kl": 0.04095458984375, + "learning_rate": 1e-06, + "loss": 0.024, + "num_tokens": 46063625.0, + "reward": 1.1656250953674316, + "reward_std": 0.10079064220190048, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16562499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17553408443927765, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 894.0, + "completions/max_terminated_length": 894.0, + "completions/mean_length": 416.77679443359375, + "completions/mean_terminated_length": 416.77679443359375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.4168171266443126, + "grad_norm": 0.5703777074813843, + "kl": 0.04595947265625, + "learning_rate": 1e-06, + "loss": -0.0026, + "num_tokens": 46175761.0, + "reward": 1.09375, + "reward_std": 0.06851832568645477, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 399.0000305175781, + "completions/mean_terminated_length": 399.0000305175781, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.41784885220531337, + "grad_norm": 0.6118718385696411, + "kl": 0.05035400390625, + "learning_rate": 1e-06, + "loss": 0.0049, + "num_tokens": 46293018.0, + "reward": 1.100000023841858, + "reward_std": 0.08706042170524597, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.15882451832294464, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 433.4285888671875, + "completions/mean_terminated_length": 433.4285888671875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.41888057776631415, + "grad_norm": 0.5196507573127747, + "kl": 0.05023193359375, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 46419367.0, + "reward": 1.1031250953674316, + "reward_std": 0.07842406630516052, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10312499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16027583181858063, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 383.96429443359375, + "completions/mean_terminated_length": 383.96429443359375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.41991230332731494, + "grad_norm": 0.5900987386703491, + "kl": 0.0543212890625, + "learning_rate": 1e-06, + "loss": -0.0019, + "num_tokens": 46535521.0, + "reward": 1.118749976158142, + "reward_std": 0.0612691231071949, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.16645820438861847, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 931.0, + "completions/max_terminated_length": 931.0, + "completions/mean_length": 380.39288330078125, + "completions/mean_terminated_length": 380.39288330078125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.4209440288883157, + "grad_norm": 0.8150759935379028, + "kl": 0.049560546875, + "learning_rate": 1e-06, + "loss": 0.0247, + "num_tokens": 46639949.0, + "reward": 1.1750000715255737, + "reward_std": 0.11532045155763626, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.17499998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.17578651010990143, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.0, + "completions/max_terminated_length": 678.0, + "completions/mean_length": 399.0357360839844, + "completions/mean_terminated_length": 399.0357360839844, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.4219757544493165, + "grad_norm": 0.5155802369117737, + "kl": 0.05224609375, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 46756118.0, + "reward": 1.1375001668930054, + "reward_std": 0.049576446413993835, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17170318961143494, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1088.0, + "completions/max_terminated_length": 1088.0, + "completions/mean_length": 467.6607360839844, + "completions/mean_terminated_length": 467.6607360839844, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.42300748001031724, + "grad_norm": 0.33023545145988464, + "kl": 0.04486083984375, + "learning_rate": 1e-06, + "loss": -0.0056, + "num_tokens": 46890669.0, + "reward": 1.0406250953674316, + "reward_std": 0.028228627517819405, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.04062500223517418, + "rewards/curriculum_aware_reward_fn/std": 0.11261254549026489, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 419.0625305175781, + "completions/mean_terminated_length": 419.0625305175781, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.424039205571318, + "grad_norm": 0.6532723903656006, + "kl": 0.04840087890625, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 47002921.0, + "reward": 1.1093751192092896, + "reward_std": 0.06509362161159515, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.109375, + "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 932.0, + "completions/max_terminated_length": 932.0, + "completions/mean_length": 448.02679443359375, + "completions/mean_terminated_length": 448.02679443359375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.4250709311323188, + "grad_norm": 0.7180759310722351, + "kl": 0.04180908203125, + "learning_rate": 1e-06, + "loss": 0.0183, + "num_tokens": 47114567.0, + "reward": 1.1218750476837158, + "reward_std": 0.08949775993824005, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 410.9464416503906, + "completions/mean_terminated_length": 410.9464416503906, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.4261026566933196, + "grad_norm": 0.7026389837265015, + "kl": 0.05511474609375, + "learning_rate": 1e-06, + "loss": -0.0052, + "num_tokens": 47221150.0, + "reward": 1.0625, + "reward_std": 0.08746020495891571, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.0625, + "rewards/curriculum_aware_reward_fn/std": 0.13465002179145813, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 392.1160888671875, + "completions/mean_terminated_length": 392.1160888671875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.4271343822543204, + "grad_norm": 0.6177908778190613, + "kl": 0.05438232421875, + "learning_rate": 1e-06, + "loss": -0.0052, + "num_tokens": 47332386.0, + "reward": 1.0687501430511475, + "reward_std": 0.06851831823587418, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 916.0, + "completions/max_terminated_length": 916.0, + "completions/mean_length": 415.9285888671875, + "completions/mean_terminated_length": 415.9285888671875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.4281661078153211, + "grad_norm": 0.6898444890975952, + "kl": 0.05059814453125, + "learning_rate": 1e-06, + "loss": 0.0142, + "num_tokens": 47435460.0, + "reward": 1.1375001668930054, + "reward_std": 0.08261694759130478, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17170317471027374, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1771.0, + "completions/max_terminated_length": 1771.0, + "completions/mean_length": 506.02679443359375, + "completions/mean_terminated_length": 506.02679443359375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.4291978333763219, + "grad_norm": 0.42713573575019836, + "kl": 0.03955078125, + "learning_rate": 1e-06, + "loss": 0.0074, + "num_tokens": 47563752.0, + "reward": 1.0973215103149414, + "reward_std": 0.07421800494194031, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1040.0, + "completions/max_terminated_length": 1040.0, + "completions/mean_length": 351.3482360839844, + "completions/mean_terminated_length": 351.3482360839844, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.4302295589373227, + "grad_norm": 0.7151590585708618, + "kl": 0.0460205078125, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 47663464.0, + "reward": 1.2000000476837158, + "reward_std": 0.07882384210824966, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.20000000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 886.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 401.89288330078125, + "completions/mean_terminated_length": 401.89288330078125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.43126128449832346, + "grad_norm": 0.5175272822380066, + "kl": 0.04498291015625, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 47777069.0, + "reward": 1.1281250715255737, + "reward_std": 0.0550387017428875, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16936305165290833, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 964.0, + "completions/max_terminated_length": 964.0, + "completions/mean_length": 351.7589416503906, + "completions/mean_terminated_length": 351.7589416503906, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.43229301005932425, + "grad_norm": 0.703593373298645, + "kl": 0.05059814453125, + "learning_rate": 1e-06, + "loss": 0.0221, + "num_tokens": 47876050.0, + "reward": 1.0687501430511475, + "reward_std": 0.10359635949134827, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.06875000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1396786868572235, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 360.8660888671875, + "completions/mean_terminated_length": 360.8660888671875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.433324735620325, + "grad_norm": 0.35815978050231934, + "kl": 0.05181884765625, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 47988007.0, + "reward": 1.109375, + "reward_std": 0.03063456155359745, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1093749925494194, + "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 930.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 363.7321472167969, + "completions/mean_terminated_length": 363.7321472167969, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.43435646118132576, + "grad_norm": 0.557057797908783, + "kl": 0.05450439453125, + "learning_rate": 1e-06, + "loss": -0.0072, + "num_tokens": 48093068.0, + "reward": 1.140625, + "reward_std": 0.05886319279670715, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1406249850988388, + "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 416.45538330078125, + "completions/mean_terminated_length": 416.45538330078125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.43538818674232654, + "grad_norm": 0.692194938659668, + "kl": 0.053466796875, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 48214303.0, + "reward": 1.0781251192092896, + "reward_std": 0.08224855363368988, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.078125, + "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1327.0, + "completions/max_terminated_length": 1327.0, + "completions/mean_length": 408.3750305175781, + "completions/mean_terminated_length": 408.3750305175781, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.43641991230332733, + "grad_norm": 0.7828994989395142, + "kl": 0.0457763671875, + "learning_rate": 1e-06, + "loss": 0.0044, + "num_tokens": 48330979.0, + "reward": 1.109375, + "reward_std": 0.09332224726676941, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1093749925494194, + "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 881.0, + "completions/max_terminated_length": 881.0, + "completions/mean_length": 377.1071472167969, + "completions/mean_terminated_length": 377.1071472167969, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.4374516378643281, + "grad_norm": 0.6373588442802429, + "kl": 0.0523681640625, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 48437398.0, + "reward": 1.1406251192092896, + "reward_std": 0.09292246401309967, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.140625, + "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 375.3214416503906, + "completions/mean_terminated_length": 375.3214416503906, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.43848336342532884, + "grad_norm": 0.7247622013092041, + "kl": 0.05340576171875, + "learning_rate": 1e-06, + "loss": -0.0036, + "num_tokens": 48539928.0, + "reward": 1.134374976158142, + "reward_std": 0.10119043290615082, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 347.1785888671875, + "completions/mean_terminated_length": 347.1785888671875, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.4395150889863296, + "grad_norm": 0.6451843976974487, + "kl": 0.07763671875, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 48639782.0, + "reward": 1.137946605682373, + "reward_std": 0.09939030557870865, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672, + "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 407.8571472167969, + "completions/mean_terminated_length": 407.8571472167969, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.4405468145473304, + "grad_norm": 0.558600902557373, + "kl": 0.04937744140625, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 48755301.0, + "reward": 1.0718750953674316, + "reward_std": 0.06472522765398026, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.07187499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 366.6875305175781, + "completions/mean_terminated_length": 366.6875305175781, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.4415785401083312, + "grad_norm": 0.6736543774604797, + "kl": 0.0601806640625, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 48857872.0, + "reward": 1.1625001430511475, + "reward_std": 0.08021100610494614, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16250000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 312.3571472167969, + "completions/mean_terminated_length": 312.3571472167969, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.442610265669332, + "grad_norm": 0.38957372307777405, + "kl": 0.063720703125, + "learning_rate": 1e-06, + "loss": -0.0027, + "num_tokens": 48957844.0, + "reward": 1.1968752145767212, + "reward_std": 0.027209853753447533, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17440778017044067, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 382.4196472167969, + "completions/mean_terminated_length": 382.4196472167969, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.4436419912303327, + "grad_norm": 0.6348445415496826, + "kl": 0.0562744140625, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 49072579.0, + "reward": 1.140625, + "reward_std": 0.07398059219121933, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.140625, + "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 364.08929443359375, + "completions/mean_terminated_length": 364.08929443359375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.4446737167913335, + "grad_norm": 0.6346526741981506, + "kl": 0.06121826171875, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 49181859.0, + "reward": 1.0968750715255737, + "reward_std": 0.0671311616897583, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09687500447034836, + "rewards/curriculum_aware_reward_fn/std": 0.15729716420173645, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 357.46429443359375, + "completions/mean_terminated_length": 357.46429443359375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.4457054423523343, + "grad_norm": 0.5235685706138611, + "kl": 0.0638427734375, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 49284629.0, + "reward": 1.1343750953674316, + "reward_std": 0.05059521645307541, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 389.2232360839844, + "completions/mean_terminated_length": 389.2232360839844, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.44673716791333506, + "grad_norm": 0.5660263299942017, + "kl": 0.06060791015625, + "learning_rate": 1e-06, + "loss": 0.0177, + "num_tokens": 49394883.0, + "reward": 1.09375, + "reward_std": 0.07157464325428009, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 340.8035888671875, + "completions/mean_terminated_length": 340.8035888671875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.44776889347433585, + "grad_norm": 0.7160601019859314, + "kl": 0.06365966796875, + "learning_rate": 1e-06, + "loss": 0.0156, + "num_tokens": 49507484.0, + "reward": 1.15625, + "reward_std": 0.08264832943677902, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15625, + "rewards/curriculum_aware_reward_fn/std": 0.174774631857872, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 356.7321472167969, + "completions/mean_terminated_length": 356.7321472167969, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.4488006190353366, + "grad_norm": 0.6561874747276306, + "kl": 0.06512451171875, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 49609703.0, + "reward": 1.1500002145767212, + "reward_std": 0.0671311616897583, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 326.8660888671875, + "completions/mean_terminated_length": 326.8660888671875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.44983234459633736, + "grad_norm": 0.6523354649543762, + "kl": 0.076416015625, + "learning_rate": 1e-06, + "loss": -0.0173, + "num_tokens": 49710334.0, + "reward": 1.15625, + "reward_std": 0.07780507951974869, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15625, + "rewards/curriculum_aware_reward_fn/std": 0.174774631857872, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 367.8750305175781, + "completions/mean_terminated_length": 367.8750305175781, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.45086407015733815, + "grad_norm": 0.7713767886161804, + "kl": 0.06683349609375, + "learning_rate": 1e-06, + "loss": 0.0195, + "num_tokens": 49819765.0, + "reward": 1.1035715341567993, + "reward_std": 0.1014278456568718, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.11250000447034836, + "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 349.40179443359375, + "completions/mean_terminated_length": 349.40179443359375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.45189579571833893, + "grad_norm": 0.7261008024215698, + "kl": 0.0672607421875, + "learning_rate": 1e-06, + "loss": -0.006, + "num_tokens": 49922080.0, + "reward": 1.1343750953674316, + "reward_std": 0.09394123405218124, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 772.0, + "completions/max_terminated_length": 772.0, + "completions/mean_length": 378.33929443359375, + "completions/mean_terminated_length": 378.33929443359375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.4529275212793397, + "grad_norm": 0.5196535587310791, + "kl": 0.0665283203125, + "learning_rate": 1e-06, + "loss": 0.0106, + "num_tokens": 50029519.0, + "reward": 1.0750000476837158, + "reward_std": 0.05543847754597664, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.07499999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.144259512424469, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 885.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 379.6785888671875, + "completions/mean_terminated_length": 379.6785888671875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.45395924684034045, + "grad_norm": 0.6472377181053162, + "kl": 0.0679931640625, + "learning_rate": 1e-06, + "loss": 0.0151, + "num_tokens": 50147587.0, + "reward": 1.0875000953674316, + "reward_std": 0.06228789687156677, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08749999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.15223558247089386, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 342.9464416503906, + "completions/mean_terminated_length": 342.9464416503906, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.45499097240134123, + "grad_norm": 0.5558903217315674, + "kl": 0.0648193359375, + "learning_rate": 1e-06, + "loss": 0.0164, + "num_tokens": 50249193.0, + "reward": 1.1625001430511475, + "reward_std": 0.04957644268870354, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16250000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 325.4375, + "completions/mean_terminated_length": 325.4375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.456022697962342, + "grad_norm": 0.6881309151649475, + "kl": 0.0703125, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 50343511.0, + "reward": 1.1406251192092896, + "reward_std": 0.05543847754597664, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1406249850988388, + "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 885.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 349.6250305175781, + "completions/mean_terminated_length": 349.6250305175781, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.4570544235233428, + "grad_norm": 0.6663326025009155, + "kl": 0.0712890625, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 50444444.0, + "reward": 1.1750001907348633, + "reward_std": 0.07678630203008652, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.17499998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.17578651010990143, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 349.6607360839844, + "completions/mean_terminated_length": 349.6607360839844, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.4580861490843436, + "grad_norm": 0.5423421263694763, + "kl": 0.07086181640625, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 50546214.0, + "reward": 1.109375, + "reward_std": 0.07092426717281342, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.109375, + "rewards/curriculum_aware_reward_fn/std": 0.16295845806598663, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 373.6160888671875, + "completions/mean_terminated_length": 373.6160888671875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.4591178746453443, + "grad_norm": 0.7760041356086731, + "kl": 0.0716552734375, + "learning_rate": 1e-06, + "loss": -0.008, + "num_tokens": 50657149.0, + "reward": 1.1500000953674316, + "reward_std": 0.08986614644527435, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 373.5535888671875, + "completions/mean_terminated_length": 373.5535888671875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.4601496002063451, + "grad_norm": 0.6723666191101074, + "kl": 0.06146240234375, + "learning_rate": 1e-06, + "loss": 0.0132, + "num_tokens": 50770681.0, + "reward": 1.0718750953674316, + "reward_std": 0.059231583029031754, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.07187499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 393.21429443359375, + "completions/mean_terminated_length": 393.21429443359375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.4611813257673459, + "grad_norm": 0.5650410056114197, + "kl": 0.06182861328125, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 50888795.0, + "reward": 1.071874976158142, + "reward_std": 0.06506221741437912, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.07187499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.1420222669839859, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 367.8571472167969, + "completions/mean_terminated_length": 367.8571472167969, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.46221305132834667, + "grad_norm": 0.616245687007904, + "kl": 0.05908203125, + "learning_rate": 1e-06, + "loss": 0.0053, + "num_tokens": 50997048.0, + "reward": 1.1593750715255737, + "reward_std": 0.06953710317611694, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15937499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.1750844269990921, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 417.02679443359375, + "completions/mean_terminated_length": 417.02679443359375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.46324477688934745, + "grad_norm": 0.5801323652267456, + "kl": 0.0625, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 51112964.0, + "reward": 1.1187500953674316, + "reward_std": 0.05543848127126694, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16645818948745728, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 407.3660888671875, + "completions/mean_terminated_length": 407.3660888671875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.4642765024503482, + "grad_norm": 0.7780681252479553, + "kl": 0.05841064453125, + "learning_rate": 1e-06, + "loss": -0.0238, + "num_tokens": 51234644.0, + "reward": 1.0750001668930054, + "reward_std": 0.08610443770885468, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.07500000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.144259512424469, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1089.0, + "completions/max_terminated_length": 1089.0, + "completions/mean_length": 415.6250305175781, + "completions/mean_terminated_length": 415.6250305175781, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.46530822801134897, + "grad_norm": 0.6879647374153137, + "kl": 0.06134033203125, + "learning_rate": 1e-06, + "loss": 0.0209, + "num_tokens": 51350991.0, + "reward": 1.1281250715255737, + "reward_std": 0.08505426347255707, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16936305165290833, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 669.0, + "completions/max_terminated_length": 669.0, + "completions/mean_length": 367.08038330078125, + "completions/mean_terminated_length": 367.08038330078125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.46633995357234975, + "grad_norm": 0.5170190930366516, + "kl": 0.0662841796875, + "learning_rate": 1e-06, + "loss": 0.0199, + "num_tokens": 51462230.0, + "reward": 1.0937501192092896, + "reward_std": 0.056425854563713074, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569157898426056, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 983.0, + "completions/max_terminated_length": 983.0, + "completions/mean_length": 342.7500305175781, + "completions/mean_terminated_length": 342.7500305175781, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.46737167913335054, + "grad_norm": 0.7710051536560059, + "kl": 0.06256103515625, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 51559460.0, + "reward": 1.146875023841858, + "reward_std": 0.07919223606586456, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14687499403953552, + "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 422.65179443359375, + "completions/mean_terminated_length": 422.65179443359375, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.4684034046943513, + "grad_norm": 0.6395953893661499, + "kl": 0.0594482421875, + "learning_rate": 1e-06, + "loss": -0.01, + "num_tokens": 51669083.0, + "reward": 1.0593750476837158, + "reward_std": 0.06025034934282303, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05937499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 402.9196472167969, + "completions/mean_terminated_length": 402.9196472167969, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.46943513025535205, + "grad_norm": 0.5968127846717834, + "kl": 0.05462646484375, + "learning_rate": 1e-06, + "loss": 0.0182, + "num_tokens": 51778162.0, + "reward": 1.1468751430511475, + "reward_std": 0.054419707506895065, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672, + "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 367.3125305175781, + "completions/mean_terminated_length": 367.3125305175781, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.47046685581635284, + "grad_norm": 0.7138005495071411, + "kl": 0.05780029296875, + "learning_rate": 1e-06, + "loss": -0.0184, + "num_tokens": 51886410.0, + "reward": 1.1625001430511475, + "reward_std": 0.10945840179920197, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16250000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.1753375083208084, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1215.0, + "completions/max_terminated_length": 1215.0, + "completions/mean_length": 452.2500305175781, + "completions/mean_terminated_length": 452.2500305175781, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.4714985813773536, + "grad_norm": 0.7615479230880737, + "kl": 0.05108642578125, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 52010506.0, + "reward": 1.1968752145767212, + "reward_std": 0.10396476089954376, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.21416938304901123, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.0, + "completions/max_terminated_length": 678.0, + "completions/mean_length": 391.77679443359375, + "completions/mean_terminated_length": 391.77679443359375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.4725303069383544, + "grad_norm": 0.7827943563461304, + "kl": 0.064208984375, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 52121627.0, + "reward": 1.1375001668930054, + "reward_std": 0.10603370517492294, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17170317471027374, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 406.1696472167969, + "completions/mean_terminated_length": 406.1696472167969, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.4735620324993552, + "grad_norm": 0.5137292146682739, + "kl": 0.05517578125, + "learning_rate": 1e-06, + "loss": 0.0131, + "num_tokens": 52226366.0, + "reward": 1.1468751430511475, + "reward_std": 0.04855767637491226, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14687499403953552, + "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1125.0, + "completions/max_terminated_length": 1125.0, + "completions/mean_length": 472.5000305175781, + "completions/mean_terminated_length": 472.5000305175781, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.4745937580603559, + "grad_norm": 0.6859979033470154, + "kl": 0.0482177734375, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 52340809.0, + "reward": 1.140625, + "reward_std": 0.10119043290615082, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.140625, + "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2388.0, + "completions/max_terminated_length": 2388.0, + "completions/mean_length": 459.4107360839844, + "completions/mean_terminated_length": 459.4107360839844, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.4756254836213567, + "grad_norm": 0.5346077680587769, + "kl": 0.0545654296875, + "learning_rate": 1e-06, + "loss": 0.0123, + "num_tokens": 52458455.0, + "reward": 1.0656250715255737, + "reward_std": 0.058863185346126556, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.06562499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.13722331821918488, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 917.0, + "completions/max_terminated_length": 917.0, + "completions/mean_length": 416.5089416503906, + "completions/mean_terminated_length": 416.5089416503906, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.4766572091823575, + "grad_norm": 0.448335736989975, + "kl": 0.05621337890625, + "learning_rate": 1e-06, + "loss": 0.0101, + "num_tokens": 52565689.0, + "reward": 1.1593750715255737, + "reward_std": 0.04130847379565239, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15937499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.1750844269990921, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 930.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 417.5625305175781, + "completions/mean_terminated_length": 417.5625305175781, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.4776889347433583, + "grad_norm": 0.5065658688545227, + "kl": 0.064697265625, + "learning_rate": 1e-06, + "loss": -0.0151, + "num_tokens": 52683478.0, + "reward": 1.125, + "reward_std": 0.043346013873815536, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 429.7589416503906, + "completions/mean_terminated_length": 429.7589416503906, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.47872066030435906, + "grad_norm": 0.5872586369514465, + "kl": 0.0594482421875, + "learning_rate": 1e-06, + "loss": 0.0093, + "num_tokens": 52799266.0, + "reward": 1.1437500715255737, + "reward_std": 0.06469383090734482, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.17296110093593597, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 399.9464416503906, + "completions/mean_terminated_length": 399.9464416503906, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.4797523858653598, + "grad_norm": 0.6111489534378052, + "kl": 0.05621337890625, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 52907620.0, + "reward": 1.1250001192092896, + "reward_std": 0.05784441530704498, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 483.46429443359375, + "completions/mean_terminated_length": 483.46429443359375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.48078411142636057, + "grad_norm": 0.6069656014442444, + "kl": 0.0506591796875, + "learning_rate": 1e-06, + "loss": 0.0106, + "num_tokens": 53039930.0, + "reward": 1.0843751430511475, + "reward_std": 0.0812297835946083, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08437500894069672, + "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 806.0, + "completions/max_terminated_length": 806.0, + "completions/mean_length": 432.33038330078125, + "completions/mean_terminated_length": 432.33038330078125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.48181583698736136, + "grad_norm": 0.6573135256767273, + "kl": 0.053955078125, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 53156692.0, + "reward": 1.1281251907348633, + "reward_std": 0.0612691231071949, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1135.0, + "completions/max_terminated_length": 1135.0, + "completions/mean_length": 406.70538330078125, + "completions/mean_terminated_length": 406.70538330078125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.48284756254836214, + "grad_norm": 0.6067453026771545, + "kl": 0.05816650390625, + "learning_rate": 1e-06, + "loss": 0.0178, + "num_tokens": 53271130.0, + "reward": 1.1781251430511475, + "reward_std": 0.08326731622219086, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.17812499403953552, + "rewards/curriculum_aware_reward_fn/std": 0.175758495926857, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1820.0, + "completions/max_terminated_length": 1820.0, + "completions/mean_length": 475.1964416503906, + "completions/mean_terminated_length": 475.1964416503906, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.4838792881093629, + "grad_norm": 0.5233703851699829, + "kl": 0.0491943359375, + "learning_rate": 1e-06, + "loss": 0.0044, + "num_tokens": 53396431.0, + "reward": 1.078125, + "reward_std": 0.06228789687156677, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.078125, + "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 426.8571472167969, + "completions/mean_terminated_length": 426.8571472167969, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.4849110136703637, + "grad_norm": 0.6300012469291687, + "kl": 0.06439208984375, + "learning_rate": 1e-06, + "loss": 0.0101, + "num_tokens": 53516612.0, + "reward": 1.0875000953674316, + "reward_std": 0.07678630203008652, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08749999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.15223558247089386, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 867.0, + "completions/max_terminated_length": 867.0, + "completions/mean_length": 380.64288330078125, + "completions/mean_terminated_length": 380.64288330078125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.48594273923136444, + "grad_norm": 0.6337840557098389, + "kl": 0.0531005859375, + "learning_rate": 1e-06, + "loss": -0.0115, + "num_tokens": 53625506.0, + "reward": 1.1500000953674316, + "reward_std": 0.06611239165067673, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 828.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 379.6250305175781, + "completions/mean_terminated_length": 379.6250305175781, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.4869744647923652, + "grad_norm": 0.6945529580116272, + "kl": 0.0655517578125, + "learning_rate": 1e-06, + "loss": -0.0072, + "num_tokens": 53728532.0, + "reward": 1.1437500715255737, + "reward_std": 0.07536773383617401, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.17296108603477478, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 820.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 451.33929443359375, + "completions/mean_terminated_length": 451.33929443359375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.488006190353366, + "grad_norm": 0.5490451455116272, + "kl": 0.05792236328125, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 53845365.0, + "reward": 1.0812500715255737, + "reward_std": 0.06025035306811333, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08124999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.14843402802944183, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 391.4107360839844, + "completions/mean_terminated_length": 391.4107360839844, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.4890379159143668, + "grad_norm": 0.6644481420516968, + "kl": 0.06201171875, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 53950705.0, + "reward": 1.1437500715255737, + "reward_std": 0.08224855363368988, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14374999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.17296110093593597, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 431.27679443359375, + "completions/mean_terminated_length": 431.27679443359375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.4900696414753676, + "grad_norm": 0.6886666417121887, + "kl": 0.057861328125, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 54072858.0, + "reward": 1.1156251430511475, + "reward_std": 0.08607304841279984, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562500894069672, + "rewards/curriculum_aware_reward_fn/std": 0.20146213471889496, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1161.0, + "completions/max_terminated_length": 1161.0, + "completions/mean_length": 463.544677734375, + "completions/mean_terminated_length": 463.544677734375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.4911013670363683, + "grad_norm": 0.7024446129798889, + "kl": 0.05206298828125, + "learning_rate": 1e-06, + "loss": 0.013, + "num_tokens": 54195594.0, + "reward": 1.1781251430511475, + "reward_std": 0.10461514443159103, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.17812500894069672, + "rewards/curriculum_aware_reward_fn/std": 0.2348836213350296, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 426.8035888671875, + "completions/mean_terminated_length": 426.8035888671875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.4921330925973691, + "grad_norm": 0.6875271797180176, + "kl": 0.05548095703125, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 54305878.0, + "reward": 1.1281250715255737, + "reward_std": 0.09051652997732162, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16936305165290833, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1037.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 460.4107360839844, + "completions/mean_terminated_length": 460.4107360839844, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.4931648181583699, + "grad_norm": 0.5122353434562683, + "kl": 0.05712890625, + "learning_rate": 1e-06, + "loss": 0.017, + "num_tokens": 54423580.0, + "reward": 1.1062501668930054, + "reward_std": 0.05682564154267311, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 435.1339416503906, + "completions/mean_terminated_length": 435.1339416503906, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.49419654371937066, + "grad_norm": 0.695402204990387, + "kl": 0.0577392578125, + "learning_rate": 1e-06, + "loss": 0.0107, + "num_tokens": 54546018.0, + "reward": 1.1406251192092896, + "reward_std": 0.08949775993824005, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.140625, + "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 379.65179443359375, + "completions/mean_terminated_length": 379.65179443359375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.49522826928037145, + "grad_norm": 0.8098606467247009, + "kl": 0.05194091796875, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 54648809.0, + "reward": 1.1687501668930054, + "reward_std": 0.09292247146368027, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16875000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17567436397075653, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 735.0, + "completions/max_terminated_length": 735.0, + "completions/mean_length": 416.26788330078125, + "completions/mean_terminated_length": 416.26788330078125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.4962599948413722, + "grad_norm": 0.5609670281410217, + "kl": 0.06036376953125, + "learning_rate": 1e-06, + "loss": 0.0146, + "num_tokens": 54762638.0, + "reward": 1.1343750953674316, + "reward_std": 0.06367506831884384, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 799.0, + "completions/max_terminated_length": 799.0, + "completions/mean_length": 433.0000305175781, + "completions/mean_terminated_length": 433.0000305175781, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.49729172040237296, + "grad_norm": 0.6345481872558594, + "kl": 0.06072998046875, + "learning_rate": 1e-06, + "loss": -0.0155, + "num_tokens": 54876269.0, + "reward": 1.1468751430511475, + "reward_std": 0.08709181845188141, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14687499403953552, + "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1076.0, + "completions/max_terminated_length": 1076.0, + "completions/mean_length": 427.27679443359375, + "completions/mean_terminated_length": 427.27679443359375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.49832344596337375, + "grad_norm": 0.6227778792381287, + "kl": 0.05731201171875, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 54992227.0, + "reward": 1.1156251430511475, + "reward_std": 0.05886319279670715, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562500894069672, + "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 848.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 466.4732360839844, + "completions/mean_terminated_length": 466.4732360839844, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.49935517152437453, + "grad_norm": 0.521126389503479, + "kl": 0.05126953125, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 55111175.0, + "reward": 1.1312501430511475, + "reward_std": 0.05784441903233528, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13124999403953552, + "rewards/curriculum_aware_reward_fn/std": 0.17020456492900848, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 939.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 449.732177734375, + "completions/mean_terminated_length": 449.732177734375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.5003868970853753, + "grad_norm": 0.5438706278800964, + "kl": 0.0584716796875, + "learning_rate": 1e-06, + "loss": 0.0123, + "num_tokens": 55226554.0, + "reward": 1.1187500953674316, + "reward_std": 0.07197443395853043, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17296108603477478, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1259.0, + "completions/max_terminated_length": 1259.0, + "completions/mean_length": 448.08929443359375, + "completions/mean_terminated_length": 448.08929443359375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.501418622646376, + "grad_norm": 0.5385457873344421, + "kl": 0.0576171875, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 55343184.0, + "reward": 1.1437500715255737, + "reward_std": 0.07919223606586456, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.17296110093593597, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 957.0, + "completions/max_terminated_length": 957.0, + "completions/mean_length": 412.6875305175781, + "completions/mean_terminated_length": 412.6875305175781, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.5024503482073769, + "grad_norm": 0.5771428346633911, + "kl": 0.05352783203125, + "learning_rate": 1e-06, + "loss": -0.0092, + "num_tokens": 55456661.0, + "reward": 1.1687501668930054, + "reward_std": 0.08021100610494614, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16875000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17567436397075653, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1353.0, + "completions/max_terminated_length": 1353.0, + "completions/mean_length": 430.0357360839844, + "completions/mean_terminated_length": 430.0357360839844, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.5034820737683776, + "grad_norm": 0.7232245802879333, + "kl": 0.06512451171875, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 55572011.0, + "reward": 1.140625, + "reward_std": 0.0981341153383255, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.140625, + "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1030.0, + "completions/max_terminated_length": 1030.0, + "completions/mean_length": 426.83929443359375, + "completions/mean_terminated_length": 426.83929443359375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.5045137993293783, + "grad_norm": 0.5774347186088562, + "kl": 0.0574951171875, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 55688291.0, + "reward": 1.1437500715255737, + "reward_std": 0.06130051985383034, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.17296108603477478, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 421.4107360839844, + "completions/mean_terminated_length": 421.4107360839844, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.5055455248903792, + "grad_norm": 0.6256279945373535, + "kl": 0.0693359375, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 55800477.0, + "reward": 1.1000001430511475, + "reward_std": 0.08706042170524597, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09999998658895493, + "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 957.0, + "completions/max_terminated_length": 957.0, + "completions/mean_length": 500.4107360839844, + "completions/mean_terminated_length": 500.4107360839844, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.5065772504513799, + "grad_norm": 0.5111896991729736, + "kl": 0.052734375, + "learning_rate": 1e-06, + "loss": -0.0063, + "num_tokens": 55928987.0, + "reward": 1.0562500953674316, + "reward_std": 0.05886319279670715, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05624999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.12912124395370483, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 506.52679443359375, + "completions/mean_terminated_length": 506.52679443359375, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.5076089760123808, + "grad_norm": 0.534400224685669, + "kl": 0.05596923828125, + "learning_rate": 1e-06, + "loss": -0.0036, + "num_tokens": 56061479.0, + "reward": 1.0968750715255737, + "reward_std": 0.06367506086826324, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09687499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.15729716420173645, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 941.0, + "completions/max_terminated_length": 941.0, + "completions/mean_length": 470.15179443359375, + "completions/mean_terminated_length": 470.15179443359375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.5086407015733815, + "grad_norm": 0.5752992630004883, + "kl": 0.06439208984375, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 56186124.0, + "reward": 1.0562500953674316, + "reward_std": 0.054419707506895065, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05624999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.12912124395370483, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1067.0, + "completions/max_terminated_length": 1067.0, + "completions/mean_length": 459.1250305175781, + "completions/mean_terminated_length": 459.1250305175781, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.5096724271343822, + "grad_norm": 0.6120277047157288, + "kl": 0.06231689453125, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 56308988.0, + "reward": 1.0660713911056519, + "reward_std": 0.0911223366856575, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.07499999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.144259512424469, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1065.0, + "completions/max_terminated_length": 1065.0, + "completions/mean_length": 416.8214416503906, + "completions/mean_terminated_length": 416.8214416503906, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.510704152695383, + "grad_norm": 0.5477854013442993, + "kl": 0.0775146484375, + "learning_rate": 1e-06, + "loss": -0.0134, + "num_tokens": 56408158.0, + "reward": 1.1937501430511475, + "reward_std": 0.06367506086826324, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19375000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.174774631857872, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 436.8214416503906, + "completions/mean_terminated_length": 436.8214416503906, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.5117358782563838, + "grad_norm": 0.7140277028083801, + "kl": 0.08984375, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 56517233.0, + "reward": 1.0750000476837158, + "reward_std": 0.09394123405218124, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.07499999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.144259512424469, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 904.0, + "completions/max_terminated_length": 904.0, + "completions/mean_length": 415.1785888671875, + "completions/mean_terminated_length": 415.1785888671875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.5127676038173846, + "grad_norm": 0.6335656642913818, + "kl": 0.0782470703125, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 56623184.0, + "reward": 1.1500000953674316, + "reward_std": 0.0674995556473732, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1242.0, + "completions/max_terminated_length": 1242.0, + "completions/mean_length": 464.1160888671875, + "completions/mean_terminated_length": 464.1160888671875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.5137993293783853, + "grad_norm": 0.6236574053764343, + "kl": 0.0736083984375, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 56738354.0, + "reward": 1.1406251192092896, + "reward_std": 0.0778050646185875, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1406249850988388, + "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 443.01788330078125, + "completions/mean_terminated_length": 443.01788330078125, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.5148310549393861, + "grad_norm": 0.6208124756813049, + "kl": 0.0714111328125, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 56847584.0, + "reward": 1.131250023841858, + "reward_std": 0.08505427092313766, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13125000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.17020456492900848, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1099.0, + "completions/max_terminated_length": 1099.0, + "completions/mean_length": 581.5089721679688, + "completions/mean_terminated_length": 581.5089721679688, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "epoch": 0.5158627805003869, + "grad_norm": 0.4899929463863373, + "kl": 0.05731201171875, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 56992947.0, + "reward": 1.0562500953674316, + "reward_std": 0.05303255096077919, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05624999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.12912124395370483, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.0, + "completions/max_terminated_length": 773.0, + "completions/mean_length": 463.8482360839844, + "completions/mean_terminated_length": 463.8482360839844, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.5168945060613876, + "grad_norm": 0.41739174723625183, + "kl": 0.0767822265625, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 57113820.0, + "reward": 1.0937501192092896, + "reward_std": 0.04130847007036209, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569157898426056, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 426.9910888671875, + "completions/mean_terminated_length": 426.9910888671875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.5179262316223885, + "grad_norm": 0.6298090815544128, + "kl": 0.079345703125, + "learning_rate": 1e-06, + "loss": -0.0036, + "num_tokens": 57222592.0, + "reward": 1.234375, + "reward_std": 0.07882384210824966, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.234375, + "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1367.0, + "completions/max_terminated_length": 1367.0, + "completions/mean_length": 508.7410888671875, + "completions/mean_terminated_length": 508.7410888671875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.5189579571833892, + "grad_norm": 0.5767647624015808, + "kl": 0.071044921875, + "learning_rate": 1e-06, + "loss": -0.007, + "num_tokens": 57343628.0, + "reward": 1.1312501430511475, + "reward_std": 0.06953709572553635, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13124999403953552, + "rewards/curriculum_aware_reward_fn/std": 0.17020456492900848, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 419.95538330078125, + "completions/mean_terminated_length": 419.95538330078125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.51998968274439, + "grad_norm": 0.5730735063552856, + "kl": 0.080322265625, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 57456148.0, + "reward": 1.1125000715255737, + "reward_std": 0.06851832568645477, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11249999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 959.0, + "completions/max_terminated_length": 959.0, + "completions/mean_length": 482.3482360839844, + "completions/mean_terminated_length": 482.3482360839844, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.5210214083053908, + "grad_norm": 0.7494083046913147, + "kl": 0.082763671875, + "learning_rate": 1e-06, + "loss": 0.0174, + "num_tokens": 57584952.0, + "reward": 1.1098215579986572, + "reward_std": 0.11654523760080338, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16645818948745728, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1121.0, + "completions/max_terminated_length": 1121.0, + "completions/mean_length": 560.1875, + "completions/mean_terminated_length": 560.1875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.5220531338663915, + "grad_norm": 0.5996860861778259, + "kl": 0.064697265625, + "learning_rate": 1e-06, + "loss": 0.0054, + "num_tokens": 57712674.0, + "reward": 1.0843751430511475, + "reward_std": 0.0919036939740181, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08437500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.17732131481170654, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1043.0, + "completions/max_terminated_length": 1043.0, + "completions/mean_length": 502.3035888671875, + "completions/mean_terminated_length": 502.3035888671875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.5230848594273924, + "grad_norm": 0.49378275871276855, + "kl": 0.070068359375, + "learning_rate": 1e-06, + "loss": 0.0077, + "num_tokens": 57835166.0, + "reward": 1.0937501192092896, + "reward_std": 0.05886319279670715, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569157898426056, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 483.26788330078125, + "completions/mean_terminated_length": 483.26788330078125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.5241165849883931, + "grad_norm": 0.6792031526565552, + "kl": 0.06884765625, + "learning_rate": 1e-06, + "loss": 0.0124, + "num_tokens": 57956702.0, + "reward": 1.1843751668930054, + "reward_std": 0.09915288537740707, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.18437500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17553409934043884, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1205.0, + "completions/max_terminated_length": 1205.0, + "completions/mean_length": 552.7142944335938, + "completions/mean_terminated_length": 552.7142944335938, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.5251483105493938, + "grad_norm": 0.3717767596244812, + "kl": 0.068359375, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 58085169.0, + "reward": 1.053125023841858, + "reward_std": 0.03507804498076439, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05312499776482582, + "rewards/curriculum_aware_reward_fn/std": 0.1261489987373352, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1499.0, + "completions/max_terminated_length": 1499.0, + "completions/mean_length": 556.4107666015625, + "completions/mean_terminated_length": 556.4107666015625, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.5261800361103947, + "grad_norm": 0.47721174359321594, + "kl": 0.0660400390625, + "learning_rate": 1e-06, + "loss": 0.0161, + "num_tokens": 58219279.0, + "reward": 1.0625, + "reward_std": 0.0612691231071949, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.0625, + "rewards/curriculum_aware_reward_fn/std": 0.13465002179145813, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 493.4107360839844, + "completions/mean_terminated_length": 493.4107360839844, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.5272117616713954, + "grad_norm": 0.6741013526916504, + "kl": 0.0714111328125, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 58338990.0, + "reward": 1.1218751668930054, + "reward_std": 0.08709181845188141, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 926.0, + "completions/max_terminated_length": 926.0, + "completions/mean_length": 514.4910888671875, + "completions/mean_terminated_length": 514.4910888671875, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.5282434872323962, + "grad_norm": 0.621870219707489, + "kl": 0.0732421875, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 58460460.0, + "reward": 1.1160715818405151, + "reward_std": 0.09299345314502716, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.1249999925494194, + "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 568.6607666015625, + "completions/mean_terminated_length": 568.6607666015625, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.529275212793397, + "grad_norm": 0.6457920074462891, + "kl": 0.068115234375, + "learning_rate": 1e-06, + "loss": 0.0252, + "num_tokens": 58596495.0, + "reward": 1.1254465579986572, + "reward_std": 0.12541785836219788, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1916.0, + "completions/max_terminated_length": 1916.0, + "completions/mean_length": 594.7589721679688, + "completions/mean_terminated_length": 594.7589721679688, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.5303069383543977, + "grad_norm": 0.6898475885391235, + "kl": 0.065673828125, + "learning_rate": 1e-06, + "loss": 0.0125, + "num_tokens": 58735922.0, + "reward": 1.1218750476837158, + "reward_std": 0.10742086172103882, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 921.0, + "completions/max_terminated_length": 921.0, + "completions/mean_length": 537.3214721679688, + "completions/mean_terminated_length": 537.3214721679688, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.5313386639153985, + "grad_norm": 0.5636951327323914, + "kl": 0.072998046875, + "learning_rate": 1e-06, + "loss": -0.0077, + "num_tokens": 58874929.0, + "reward": 1.1281250715255737, + "reward_std": 0.08261694014072418, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16936303675174713, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1224.0, + "completions/max_terminated_length": 1224.0, + "completions/mean_length": 539.0178833007812, + "completions/mean_terminated_length": 539.0178833007812, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.5323703894763993, + "grad_norm": 0.45992597937583923, + "kl": 0.0703125, + "learning_rate": 1e-06, + "loss": 0.0209, + "num_tokens": 59009282.0, + "reward": 1.0906251668930054, + "reward_std": 0.0612691268324852, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1305.0, + "completions/max_terminated_length": 1305.0, + "completions/mean_length": 499.58929443359375, + "completions/mean_terminated_length": 499.58929443359375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.5334021150374001, + "grad_norm": 0.3661348223686218, + "kl": 0.07275390625, + "learning_rate": 1e-06, + "loss": 0.0118, + "num_tokens": 59131631.0, + "reward": 1.1723216772079468, + "reward_std": 0.0601193830370903, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.18124999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.21520207822322845, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1206.0, + "completions/max_terminated_length": 1206.0, + "completions/mean_length": 483.08038330078125, + "completions/mean_terminated_length": 483.08038330078125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.5344338405984008, + "grad_norm": 0.7646259069442749, + "kl": 0.07177734375, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 59252355.0, + "reward": 1.1218751668930054, + "reward_std": 0.11427027732133865, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 944.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 510.5625305175781, + "completions/mean_terminated_length": 510.5625305175781, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.5354655661594016, + "grad_norm": 0.6553826928138733, + "kl": 0.06451416015625, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 59373826.0, + "reward": 1.1375000476837158, + "reward_std": 0.10803984105587006, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17170315980911255, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 878.0, + "completions/max_terminated_length": 878.0, + "completions/mean_length": 511.0714416503906, + "completions/mean_terminated_length": 511.0714416503906, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.5364972917204024, + "grad_norm": 0.5374631285667419, + "kl": 0.0660400390625, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 59500408.0, + "reward": 1.0906251668930054, + "reward_std": 0.07780507206916809, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 974.0, + "completions/max_terminated_length": 974.0, + "completions/mean_length": 514.732177734375, + "completions/mean_terminated_length": 514.732177734375, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.5375290172814031, + "grad_norm": 0.5410560965538025, + "kl": 0.066162109375, + "learning_rate": 1e-06, + "loss": -0.0133, + "num_tokens": 59633840.0, + "reward": 1.0723215341567993, + "reward_std": 0.080716073513031, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.08125000447034836, + "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 820.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 483.14288330078125, + "completions/mean_terminated_length": 483.14288330078125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.538560742842404, + "grad_norm": 0.4773089289665222, + "kl": 0.0733642578125, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 59756818.0, + "reward": 1.1187500953674316, + "reward_std": 0.04957644268870354, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.16645818948745728, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1191.0, + "completions/max_terminated_length": 1191.0, + "completions/mean_length": 482.8035888671875, + "completions/mean_terminated_length": 482.8035888671875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.5395924684034047, + "grad_norm": 0.3887348473072052, + "kl": 0.06591796875, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 59875929.0, + "reward": 1.0812500715255737, + "reward_std": 0.03405927121639252, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08125000447034836, + "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 472.857177734375, + "completions/mean_terminated_length": 472.857177734375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.5406241939644054, + "grad_norm": 0.7101907730102539, + "kl": 0.064453125, + "learning_rate": 1e-06, + "loss": 0.0268, + "num_tokens": 59997209.0, + "reward": 1.1723215579986572, + "reward_std": 0.11900390684604645, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.18124999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17567436397075653, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 893.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 454.4375305175781, + "completions/mean_terminated_length": 454.4375305175781, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.5416559195254063, + "grad_norm": 0.6443462371826172, + "kl": 0.07470703125, + "learning_rate": 1e-06, + "loss": 0.0051, + "num_tokens": 60117756.0, + "reward": 1.0968750715255737, + "reward_std": 0.08021100610494614, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09687500447034836, + "rewards/curriculum_aware_reward_fn/std": 0.15729716420173645, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1014.0, + "completions/mean_length": 524.875, + "completions/mean_terminated_length": 492.7027282714844, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.542687645086407, + "grad_norm": 0.5063578486442566, + "kl": 0.06024169921875, + "learning_rate": 1e-06, + "loss": 0.0342, + "num_tokens": 60247921.0, + "reward": 1.0660715103149414, + "reward_std": 0.09110311418771744, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.07500000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.144259512424469, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 468.0089416503906, + "completions/mean_terminated_length": 468.0089416503906, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.5437193706474078, + "grad_norm": 0.6476441621780396, + "kl": 0.0732421875, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 60367265.0, + "reward": 1.1062501668930054, + "reward_std": 0.10220920294523239, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 956.0, + "completions/max_terminated_length": 956.0, + "completions/mean_length": 482.4464416503906, + "completions/mean_terminated_length": 482.4464416503906, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.5447510962084086, + "grad_norm": 0.6608678698539734, + "kl": 0.06353759765625, + "learning_rate": 1e-06, + "loss": -0.0134, + "num_tokens": 60495870.0, + "reward": 1.1343750953674316, + "reward_std": 0.09193507581949234, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1126.0, + "completions/max_terminated_length": 1126.0, + "completions/mean_length": 552.7678833007812, + "completions/mean_terminated_length": 552.7678833007812, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.5457828217694093, + "grad_norm": 0.650008499622345, + "kl": 0.06597900390625, + "learning_rate": 1e-06, + "loss": 0.0118, + "num_tokens": 60635389.0, + "reward": 1.078125, + "reward_std": 0.0812297835946083, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.078125, + "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 441.58038330078125, + "completions/mean_terminated_length": 441.58038330078125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.5468145473304101, + "grad_norm": 0.43306875228881836, + "kl": 0.0657958984375, + "learning_rate": 1e-06, + "loss": -0.01, + "num_tokens": 60754881.0, + "reward": 1.0593750476837158, + "reward_std": 0.038902536034584045, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05937499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 441.5357360839844, + "completions/mean_terminated_length": 441.5357360839844, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.5478462728914109, + "grad_norm": 0.7140151858329773, + "kl": 0.069091796875, + "learning_rate": 1e-06, + "loss": 0.0095, + "num_tokens": 60871015.0, + "reward": 1.1375000476837158, + "reward_std": 0.09572818130254745, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17170317471027374, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1150.0, + "completions/max_terminated_length": 1150.0, + "completions/mean_length": 481.357177734375, + "completions/mean_terminated_length": 481.357177734375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.5488779984524117, + "grad_norm": 0.6157094836235046, + "kl": 0.064453125, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 60989068.0, + "reward": 1.0843751430511475, + "reward_std": 0.10017165541648865, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08437498658895493, + "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1037.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 531.7678833007812, + "completions/mean_terminated_length": 531.7678833007812, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.5499097240134124, + "grad_norm": 0.35044318437576294, + "kl": 0.05377197265625, + "learning_rate": 1e-06, + "loss": -0.0062, + "num_tokens": 61114783.0, + "reward": 1.0843751430511475, + "reward_std": 0.03202172368764877, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08437498658895493, + "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 952.0, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 464.21429443359375, + "completions/mean_terminated_length": 464.21429443359375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.5509414495744132, + "grad_norm": 0.5607831478118896, + "kl": 0.062744140625, + "learning_rate": 1e-06, + "loss": 0.0195, + "num_tokens": 61241955.0, + "reward": 1.1406251192092896, + "reward_std": 0.06956849992275238, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1406249850988388, + "rewards/curriculum_aware_reward_fn/std": 0.17236186563968658, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 953.0, + "completions/max_terminated_length": 953.0, + "completions/mean_length": 494.96429443359375, + "completions/mean_terminated_length": 494.96429443359375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.551973175135414, + "grad_norm": 0.6276484727859497, + "kl": 0.0616455078125, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 61356149.0, + "reward": 1.0723214149475098, + "reward_std": 0.09315988421440125, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.08125000447034836, + "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 897.0, + "completions/max_terminated_length": 897.0, + "completions/mean_length": 509.419677734375, + "completions/mean_terminated_length": 509.419677734375, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.5530049006964147, + "grad_norm": 0.5943368673324585, + "kl": 0.05865478515625, + "learning_rate": 1e-06, + "loss": 0.0084, + "num_tokens": 61483617.0, + "reward": 1.09375, + "reward_std": 0.0674995481967926, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1164.0, + "completions/max_terminated_length": 1164.0, + "completions/mean_length": 556.4464721679688, + "completions/mean_terminated_length": 556.4464721679688, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.5540366262574156, + "grad_norm": 0.45444533228874207, + "kl": 0.05535888671875, + "learning_rate": 1e-06, + "loss": -0.0025, + "num_tokens": 61618039.0, + "reward": 1.100000023841858, + "reward_std": 0.060250356793403625, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09999998658895493, + "rewards/curriculum_aware_reward_fn/std": 0.15882450342178345, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 488.044677734375, + "completions/mean_terminated_length": 488.044677734375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.5550683518184163, + "grad_norm": 0.49808990955352783, + "kl": 0.06365966796875, + "learning_rate": 1e-06, + "loss": 0.0049, + "num_tokens": 61745598.0, + "reward": 1.0812500715255737, + "reward_std": 0.0660809874534607, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08124999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1487.0, + "completions/max_terminated_length": 1487.0, + "completions/mean_length": 534.7232666015625, + "completions/mean_terminated_length": 534.7232666015625, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.556100077379417, + "grad_norm": 0.369132399559021, + "kl": 0.06011962890625, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 61879071.0, + "reward": 1.0750000476837158, + "reward_std": 0.03547782823443413, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.07499999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.144259512424469, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 482.83929443359375, + "completions/mean_terminated_length": 482.83929443359375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.5571318029404179, + "grad_norm": 0.39808061718940735, + "kl": 0.0628662109375, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 61998235.0, + "reward": 1.15625, + "reward_std": 0.04028969630599022, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15625, + "rewards/curriculum_aware_reward_fn/std": 0.174774631857872, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 445.3035888671875, + "completions/mean_terminated_length": 445.3035888671875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.5581635285014186, + "grad_norm": 0.7094811201095581, + "kl": 0.0657958984375, + "learning_rate": 1e-06, + "loss": 0.0376, + "num_tokens": 62112565.0, + "reward": 1.1593750715255737, + "reward_std": 0.1025775894522667, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15937499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.1750844269990921, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 861.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 454.7232360839844, + "completions/mean_terminated_length": 454.7232360839844, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.5591952540624194, + "grad_norm": 0.6402541995048523, + "kl": 0.0628662109375, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 62222651.0, + "reward": 1.1375000476837158, + "reward_std": 0.09088490903377533, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17170318961143494, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 951.0, + "completions/max_terminated_length": 951.0, + "completions/mean_length": 481.2500305175781, + "completions/mean_terminated_length": 481.2500305175781, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.5602269796234202, + "grad_norm": 0.7908324003219604, + "kl": 0.06268310546875, + "learning_rate": 1e-06, + "loss": 0.0249, + "num_tokens": 62341007.0, + "reward": 1.1218751668930054, + "reward_std": 0.12392540276050568, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.0, + "completions/max_terminated_length": 773.0, + "completions/mean_length": 472.919677734375, + "completions/mean_terminated_length": 472.919677734375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.5612587051844209, + "grad_norm": 0.493557333946228, + "kl": 0.064208984375, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 62467740.0, + "reward": 1.1125001907348633, + "reward_std": 0.04615173488855362, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11249999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.16419336199760437, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 931.0, + "completions/max_terminated_length": 931.0, + "completions/mean_length": 475.96429443359375, + "completions/mean_terminated_length": 475.96429443359375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.5622904307454217, + "grad_norm": 0.5978304743766785, + "kl": 0.05908203125, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 62586631.0, + "reward": 1.1000001430511475, + "reward_std": 0.07641790807247162, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.15882451832294464, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1043.0, + "completions/max_terminated_length": 1043.0, + "completions/mean_length": 503.8839416503906, + "completions/mean_terminated_length": 503.8839416503906, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.5633221563064225, + "grad_norm": 0.38978615403175354, + "kl": 0.06207275390625, + "learning_rate": 1e-06, + "loss": -0.0098, + "num_tokens": 62713493.0, + "reward": 1.1468751430511475, + "reward_std": 0.04130847379565239, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14687499403953552, + "rewards/curriculum_aware_reward_fn/std": 0.17350146174430847, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 475.6160888671875, + "completions/mean_terminated_length": 475.6160888671875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.5643538818674233, + "grad_norm": 0.5205122828483582, + "kl": 0.06402587890625, + "learning_rate": 1e-06, + "loss": -0.0149, + "num_tokens": 62834480.0, + "reward": 1.1218751668930054, + "reward_std": 0.05784441903233528, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 437.3839416503906, + "completions/mean_terminated_length": 437.3839416503906, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.565385607428424, + "grad_norm": 0.7416389584541321, + "kl": 0.0648193359375, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 62945249.0, + "reward": 1.1687501668930054, + "reward_std": 0.12115109711885452, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16875000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17567436397075653, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 918.0, + "completions/max_terminated_length": 918.0, + "completions/mean_length": 468.8750305175781, + "completions/mean_terminated_length": 468.8750305175781, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.5664173329894248, + "grad_norm": 0.7131133079528809, + "kl": 0.06671142578125, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 63073230.0, + "reward": 1.1129463911056519, + "reward_std": 0.13358858227729797, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.12187498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.17395521700382233, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 964.0, + "completions/mean_length": 520.1785888671875, + "completions/mean_terminated_length": 487.9639892578125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.5674490585504256, + "grad_norm": 0.5437191128730774, + "kl": 0.06109619140625, + "learning_rate": 1e-06, + "loss": 0.034, + "num_tokens": 63195057.0, + "reward": 1.1348214149475098, + "reward_std": 0.09237737953662872, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.14374999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.17296110093593597, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1045.0, + "completions/max_terminated_length": 1045.0, + "completions/mean_length": 460.2500305175781, + "completions/mean_terminated_length": 460.2500305175781, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.5684807841114263, + "grad_norm": 0.5379366278648376, + "kl": 0.0653076171875, + "learning_rate": 1e-06, + "loss": 0.014, + "num_tokens": 63309402.0, + "reward": 1.1281250715255737, + "reward_std": 0.0671311616897583, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12812498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.16936305165290833, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 950.0, + "completions/max_terminated_length": 950.0, + "completions/mean_length": 493.6964416503906, + "completions/mean_terminated_length": 493.6964416503906, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.5695125096724272, + "grad_norm": 0.5773839354515076, + "kl": 0.06378173828125, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 63438798.0, + "reward": 1.0593751668930054, + "reward_std": 0.06268768012523651, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.05937499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1319519579410553, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 489.7232360839844, + "completions/mean_terminated_length": 489.7232360839844, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.5705442352334279, + "grad_norm": 0.5632832646369934, + "kl": 0.05279541015625, + "learning_rate": 1e-06, + "loss": 0.0153, + "num_tokens": 63558145.0, + "reward": 1.134374976158142, + "reward_std": 0.0674995481967926, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1010.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 471.5625305175781, + "completions/mean_terminated_length": 471.5625305175781, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.5715759607944286, + "grad_norm": 0.5308297872543335, + "kl": 0.0601806640625, + "learning_rate": 1e-06, + "loss": -0.0149, + "num_tokens": 63677096.0, + "reward": 1.1062501668930054, + "reward_std": 0.07055586576461792, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 427.70538330078125, + "completions/mean_terminated_length": 427.70538330078125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.5726076863554295, + "grad_norm": 0.6717932224273682, + "kl": 0.0667724609375, + "learning_rate": 1e-06, + "loss": -0.0149, + "num_tokens": 63787894.0, + "reward": 1.1500000953674316, + "reward_std": 0.08709181845188141, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 407.8035888671875, + "completions/mean_terminated_length": 407.8035888671875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.5736394119164302, + "grad_norm": 0.6472453474998474, + "kl": 0.068115234375, + "learning_rate": 1e-06, + "loss": 0.0161, + "num_tokens": 63890800.0, + "reward": 1.1375000476837158, + "reward_std": 0.07438036054372787, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17170317471027374, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 431.83038330078125, + "completions/mean_terminated_length": 431.83038330078125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.574671137477431, + "grad_norm": 0.7740367650985718, + "kl": 0.064453125, + "learning_rate": 1e-06, + "loss": -0.0077, + "num_tokens": 64010733.0, + "reward": 1.21875, + "reward_std": 0.09434102475643158, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21875, + "rewards/curriculum_aware_reward_fn/std": 0.17020456492900848, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 728.0, + "completions/max_terminated_length": 728.0, + "completions/mean_length": 425.90179443359375, + "completions/mean_terminated_length": 425.90179443359375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.5757028630384318, + "grad_norm": 0.6837030053138733, + "kl": 0.08349609375, + "learning_rate": 1e-06, + "loss": 0.0121, + "num_tokens": 64118634.0, + "reward": 1.1156251430511475, + "reward_std": 0.07197443395853043, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11562498658895493, + "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 476.1607360839844, + "completions/mean_terminated_length": 476.1607360839844, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.5767345885994325, + "grad_norm": 0.5691381692886353, + "kl": 0.05963134765625, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 64243518.0, + "reward": 1.0843751430511475, + "reward_std": 0.0671311616897583, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08437500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.15037958323955536, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 930.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 508.7857360839844, + "completions/mean_terminated_length": 508.7857360839844, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.5777663141604333, + "grad_norm": 0.5485077500343323, + "kl": 0.05517578125, + "learning_rate": 1e-06, + "loss": -0.0099, + "num_tokens": 64372532.0, + "reward": 1.1066964864730835, + "reward_std": 0.10006203502416611, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.11562500149011612, + "rewards/curriculum_aware_reward_fn/std": 0.1653594672679901, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 805.0, + "completions/max_terminated_length": 805.0, + "completions/mean_length": 455.6964416503906, + "completions/mean_terminated_length": 455.6964416503906, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.5787980397214341, + "grad_norm": 0.6604443192481995, + "kl": 0.0601806640625, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 64496601.0, + "reward": 1.1343750953674316, + "reward_std": 0.06509362161159515, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 845.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 431.6339416503906, + "completions/mean_terminated_length": 431.6339416503906, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.5798297652824349, + "grad_norm": 0.613683819770813, + "kl": 0.09552001953125, + "learning_rate": 1e-06, + "loss": 0.0284, + "num_tokens": 64608560.0, + "reward": 1.1843751668930054, + "reward_std": 0.08021100610494614, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.18437500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.17553408443927765, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 409.02679443359375, + "completions/mean_terminated_length": 409.02679443359375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.5808614908434356, + "grad_norm": 0.6886749863624573, + "kl": 0.0592041015625, + "learning_rate": 1e-06, + "loss": 0.0062, + "num_tokens": 64719429.0, + "reward": 1.1187500953674316, + "reward_std": 0.10359636694192886, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.11874999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.16645818948745728, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 940.0, + "completions/max_terminated_length": 940.0, + "completions/mean_length": 417.1339416503906, + "completions/mean_terminated_length": 417.1339416503906, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.5818932164044365, + "grad_norm": 0.6410400867462158, + "kl": 0.0565185546875, + "learning_rate": 1e-06, + "loss": -0.0076, + "num_tokens": 64835161.0, + "reward": 1.1968752145767212, + "reward_std": 0.08746021240949631, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17440778017044067, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 923.0, + "completions/max_terminated_length": 923.0, + "completions/mean_length": 451.357177734375, + "completions/mean_terminated_length": 451.357177734375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.5829249419654372, + "grad_norm": 0.6501334309577942, + "kl": 0.06207275390625, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 64963273.0, + "reward": 1.1250001192092896, + "reward_std": 0.08122977614402771, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 974.0, + "completions/max_terminated_length": 974.0, + "completions/mean_length": 456.83038330078125, + "completions/mean_terminated_length": 456.83038330078125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.5839566675264379, + "grad_norm": 0.34422266483306885, + "kl": 0.05535888671875, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 65084305.0, + "reward": 1.1000001430511475, + "reward_std": 0.035477831959724426, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10000000149011612, + "rewards/curriculum_aware_reward_fn/std": 0.15882451832294464, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 453.9285888671875, + "completions/mean_terminated_length": 453.9285888671875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.5849883930874388, + "grad_norm": 0.4713008999824524, + "kl": 0.06524658203125, + "learning_rate": 1e-06, + "loss": 0.0203, + "num_tokens": 65204053.0, + "reward": 1.125, + "reward_std": 0.05096360668540001, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.1985306292772293, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 482.1964416503906, + "completions/mean_terminated_length": 482.1964416503906, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.5860201186484395, + "grad_norm": 0.6445662975311279, + "kl": 0.0772705078125, + "learning_rate": 1e-06, + "loss": 0.0229, + "num_tokens": 65331482.0, + "reward": 1.09375, + "reward_std": 0.08021100610494614, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 914.0, + "completions/max_terminated_length": 914.0, + "completions/mean_length": 421.3839416503906, + "completions/mean_terminated_length": 421.3839416503906, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.5870518442094403, + "grad_norm": 0.6060656905174255, + "kl": 0.05853271484375, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 65449356.0, + "reward": 1.1062501668930054, + "reward_std": 0.0671311616897583, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1181.0, + "completions/max_terminated_length": 1181.0, + "completions/mean_length": 414.0982360839844, + "completions/mean_terminated_length": 414.0982360839844, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.5880835697704411, + "grad_norm": 0.7277713418006897, + "kl": 0.06231689453125, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 65561199.0, + "reward": 1.1500000953674316, + "reward_std": 0.07438036799430847, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 425.7946472167969, + "completions/mean_terminated_length": 425.7946472167969, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.5891152953314418, + "grad_norm": 0.6139292120933533, + "kl": 0.05096435546875, + "learning_rate": 1e-06, + "loss": 0.0131, + "num_tokens": 65677621.0, + "reward": 1.1343750953674316, + "reward_std": 0.07095566391944885, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437500596046448, + "rewards/curriculum_aware_reward_fn/std": 0.17098432779312134, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 807.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 420.26788330078125, + "completions/mean_terminated_length": 420.26788330078125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.5901470208924426, + "grad_norm": 0.40907788276672363, + "kl": 0.0604248046875, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 65801278.0, + "reward": 1.09375, + "reward_std": 0.03507804498076439, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.15569156408309937, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 442.794677734375, + "completions/mean_terminated_length": 442.794677734375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.5911787464534434, + "grad_norm": 0.7001194357872009, + "kl": 0.059814453125, + "learning_rate": 1e-06, + "loss": -0.0214, + "num_tokens": 65922446.0, + "reward": 1.0906250476837158, + "reward_std": 0.10119043290615082, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09062499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.1540052592754364, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 375.0982360839844, + "completions/mean_terminated_length": 375.0982360839844, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.5922104720144442, + "grad_norm": 0.9055859446525574, + "kl": 0.06549072265625, + "learning_rate": 1e-06, + "loss": -0.0242, + "num_tokens": 66025770.0, + "reward": 1.1218751668930054, + "reward_std": 0.12392540276050568, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16749092936515808, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 778.0, + "completions/max_terminated_length": 778.0, + "completions/mean_length": 357.65179443359375, + "completions/mean_terminated_length": 357.65179443359375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.5932421975754449, + "grad_norm": 0.8322078585624695, + "kl": 0.06396484375, + "learning_rate": 1e-06, + "loss": 0.0189, + "num_tokens": 66127049.0, + "reward": 1.1500000953674316, + "reward_std": 0.11288311332464218, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 346.6964416503906, + "completions/mean_terminated_length": 346.6964416503906, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.5942739231364457, + "grad_norm": 0.8472862839698792, + "kl": 0.06304931640625, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 66230832.0, + "reward": 1.125, + "reward_std": 0.10603369772434235, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.16845883429050446, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 445.46429443359375, + "completions/mean_terminated_length": 445.46429443359375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.5953056486974465, + "grad_norm": 0.5997181534767151, + "kl": 0.0504150390625, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 66349310.0, + "reward": 1.078125238418579, + "reward_std": 0.08021100610494614, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.078125, + "rewards/curriculum_aware_reward_fn/std": 0.14639531075954437, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 852.0, + "completions/max_terminated_length": 852.0, + "completions/mean_length": 381.1875305175781, + "completions/mean_terminated_length": 381.1875305175781, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.5963373742584472, + "grad_norm": 0.5325617790222168, + "kl": 0.056396484375, + "learning_rate": 1e-06, + "loss": 0.0045, + "num_tokens": 66459623.0, + "reward": 1.1500002145767212, + "reward_std": 0.054419711232185364, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.17398352921009064, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 420.3571472167969, + "completions/mean_terminated_length": 420.3571472167969, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.5973690998194481, + "grad_norm": 0.5707572102546692, + "kl": 0.06243896484375, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 66577795.0, + "reward": 1.1062501668930054, + "reward_std": 0.06025034934282303, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.10624999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.16165320575237274, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1044.0, + "completions/max_terminated_length": 1044.0, + "completions/mean_length": 419.8839416503906, + "completions/mean_terminated_length": 419.8839416503906, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.5984008253804488, + "grad_norm": 0.6615480780601501, + "kl": 0.06341552734375, + "learning_rate": 1e-06, + "loss": -0.0019, + "num_tokens": 66686863.0, + "reward": 1.0812500715255737, + "reward_std": 0.08021100610494614, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.08125000447034836, + "rewards/curriculum_aware_reward_fn/std": 0.14843401312828064, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 359.9375305175781, + "completions/mean_terminated_length": 359.9375305175781, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.5994325509414495, + "grad_norm": 0.8319998979568481, + "kl": 0.06036376953125, + "learning_rate": 1e-06, + "loss": -0.0033, + "num_tokens": 66783085.0, + "reward": 1.2062499523162842, + "reward_std": 0.10600230097770691, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.20624998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.17296110093593597, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 994.0, + "completions/max_terminated_length": 994.0, + "completions/mean_length": 397.4196472167969, + "completions/mean_terminated_length": 397.4196472167969, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.6004642765024504, + "grad_norm": 0.7436203360557556, + "kl": 0.0596923828125, + "learning_rate": 1e-06, + "loss": 0.0172, + "num_tokens": 66894089.0, + "reward": 1.1656252145767212, + "reward_std": 0.09292246401309967, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16562499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.21508759260177612, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 895.0, + "completions/max_terminated_length": 895.0, + "completions/mean_length": 374.9910888671875, + "completions/mean_terminated_length": 374.9910888671875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.6014960020634511, + "grad_norm": 0.6285607218742371, + "kl": 0.06005859375, + "learning_rate": 1e-06, + "loss": -0.01, + "num_tokens": 66996897.0, + "reward": 1.28125, + "reward_std": 0.09340823441743851, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.3145943284034729, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 933.0, + "completions/max_terminated_length": 933.0, + "completions/mean_length": 382.52679443359375, + "completions/mean_terminated_length": 382.52679443359375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.602527727624452, + "grad_norm": 0.8746618628501892, + "kl": 0.06982421875, + "learning_rate": 1e-06, + "loss": 0.0165, + "num_tokens": 67112211.0, + "reward": 1.296875, + "reward_std": 0.17254236340522766, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.296875, + "rewards/curriculum_aware_reward_fn/std": 0.3052307665348053, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1027.0, + "completions/max_terminated_length": 1027.0, + "completions/mean_length": 450.8660888671875, + "completions/mean_terminated_length": 450.8660888671875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.6035594531854527, + "grad_norm": 0.6751203536987305, + "kl": 0.0572509765625, + "learning_rate": 1e-06, + "loss": -0.0107, + "num_tokens": 67227780.0, + "reward": 1.2379465103149414, + "reward_std": 0.14901825785636902, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.24687500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.31035298109054565, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 417.1339416503906, + "completions/mean_terminated_length": 417.1339416503906, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.6045911787464534, + "grad_norm": 0.7396414279937744, + "kl": 0.05865478515625, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 67341223.0, + "reward": 1.318750023841858, + "reward_std": 0.1882440447807312, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.31874996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.3501688539981842, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 390.5000305175781, + "completions/mean_terminated_length": 390.5000305175781, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.6056229043074542, + "grad_norm": 0.8871374130249023, + "kl": 0.05767822265625, + "learning_rate": 1e-06, + "loss": 0.0077, + "num_tokens": 67453937.0, + "reward": 1.2468751668930054, + "reward_std": 0.24108856916427612, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24687500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.31035298109054565, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2066.0, + "completions/max_terminated_length": 2066.0, + "completions/mean_length": 462.2232360839844, + "completions/mean_terminated_length": 462.2232360839844, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.606654629868455, + "grad_norm": 0.6875455975532532, + "kl": 0.0528564453125, + "learning_rate": 1e-06, + "loss": 0.0508, + "num_tokens": 67570736.0, + "reward": 1.3125, + "reward_std": 0.18902026116847992, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3125, + "rewards/curriculum_aware_reward_fn/std": 0.30935123562812805, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 376.1160888671875, + "completions/mean_terminated_length": 376.1160888671875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.6076863554294558, + "grad_norm": 0.9995030164718628, + "kl": 0.06695556640625, + "learning_rate": 1e-06, + "loss": 0.0143, + "num_tokens": 67690036.0, + "reward": 1.3000000715255737, + "reward_std": 0.24223896861076355, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29999998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3039529323577881, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 915.0, + "completions/max_terminated_length": 915.0, + "completions/mean_length": 406.33929443359375, + "completions/mean_terminated_length": 406.33929443359375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.6087180809904565, + "grad_norm": 0.6139847040176392, + "kl": 0.0670166015625, + "learning_rate": 1e-06, + "loss": 0.0312, + "num_tokens": 67802067.0, + "reward": 1.1656250953674316, + "reward_std": 0.09088490903377533, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16562499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.22996585071086884, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 777.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 392.45538330078125, + "completions/mean_terminated_length": 392.45538330078125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.6097498065514573, + "grad_norm": 0.7633903622627258, + "kl": 0.0614013671875, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 67911290.0, + "reward": 1.3093751668930054, + "reward_std": 0.14392894506454468, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30937498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.3313588798046112, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 934.0, + "completions/max_terminated_length": 934.0, + "completions/mean_length": 421.5089416503906, + "completions/mean_terminated_length": 421.5089416503906, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.6107815321124581, + "grad_norm": 0.790761411190033, + "kl": 0.05780029296875, + "learning_rate": 1e-06, + "loss": -0.014, + "num_tokens": 68021135.0, + "reward": 1.1754463911056519, + "reward_std": 0.19190625846385956, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.18437500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.26972052454948425, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 396.1607360839844, + "completions/mean_terminated_length": 396.1607360839844, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.6118132576734588, + "grad_norm": 0.7488933801651001, + "kl": 0.0621337890625, + "learning_rate": 1e-06, + "loss": -0.0115, + "num_tokens": 68133097.0, + "reward": 1.2906252145767212, + "reward_std": 0.12345291674137115, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.30406635999679565, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1066.0, + "completions/max_terminated_length": 1066.0, + "completions/mean_length": 434.6160888671875, + "completions/mean_terminated_length": 434.6160888671875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.6128449832344597, + "grad_norm": 0.6970859169960022, + "kl": 0.0609130859375, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 68251529.0, + "reward": 1.3156250715255737, + "reward_std": 0.10980459302663803, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3114938735961914, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 957.0, + "completions/max_terminated_length": 957.0, + "completions/mean_length": 446.33038330078125, + "completions/mean_terminated_length": 446.33038330078125, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.6138767087954604, + "grad_norm": 0.7522033452987671, + "kl": 0.06707763671875, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 68380800.0, + "reward": 1.3218750953674316, + "reward_std": 0.12741291522979736, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32187503576278687, + "rewards/curriculum_aware_reward_fn/std": 0.3013319671154022, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 924.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 433.8839416503906, + "completions/mean_terminated_length": 433.8839416503906, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.6149084343564611, + "grad_norm": 0.7475255727767944, + "kl": 0.06756591796875, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 68501201.0, + "reward": 1.1687501668930054, + "reward_std": 0.11087696254253387, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16875001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.25724852085113525, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1013.0, + "completions/max_terminated_length": 1013.0, + "completions/mean_length": 435.8750305175781, + "completions/mean_terminated_length": 435.8750305175781, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.615940159917462, + "grad_norm": 0.6649345755577087, + "kl": 0.05682373046875, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 68617653.0, + "reward": 1.2312501668930054, + "reward_std": 0.13754288852214813, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.23125000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.2954707145690918, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1253.0, + "completions/max_terminated_length": 1253.0, + "completions/mean_length": 492.7857360839844, + "completions/mean_terminated_length": 492.7857360839844, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.6169718854784627, + "grad_norm": 0.6977077722549438, + "kl": 0.05615234375, + "learning_rate": 1e-06, + "loss": 0.0161, + "num_tokens": 68741424.0, + "reward": 1.1937501430511475, + "reward_std": 0.13318131864070892, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19375000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.2522979974746704, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1048.0, + "completions/max_terminated_length": 1048.0, + "completions/mean_length": 494.107177734375, + "completions/mean_terminated_length": 494.107177734375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.6180036110394636, + "grad_norm": 0.7319642901420593, + "kl": 0.060302734375, + "learning_rate": 1e-06, + "loss": 0.0247, + "num_tokens": 68870222.0, + "reward": 1.215625286102295, + "reward_std": 0.18056097626686096, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21562500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.2905440032482147, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1302.0, + "completions/max_terminated_length": 1302.0, + "completions/mean_length": 444.33038330078125, + "completions/mean_terminated_length": 444.33038330078125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.6190353366004643, + "grad_norm": 0.7302688956260681, + "kl": 0.06500244140625, + "learning_rate": 1e-06, + "loss": 0.0139, + "num_tokens": 68996007.0, + "reward": 1.2875001430511475, + "reward_std": 0.18273977935314178, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2874999940395355, + "rewards/curriculum_aware_reward_fn/std": 0.3123941123485565, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 802.0, + "completions/max_terminated_length": 802.0, + "completions/mean_length": 391.4910888671875, + "completions/mean_terminated_length": 391.4910888671875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.620067062161465, + "grad_norm": 0.73208087682724, + "kl": 0.072021484375, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 69102700.0, + "reward": 1.3468750715255737, + "reward_std": 0.12240273505449295, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.30984458327293396, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1196.0, + "completions/max_terminated_length": 1196.0, + "completions/mean_length": 432.6607360839844, + "completions/mean_terminated_length": 432.6607360839844, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.6210987877224659, + "grad_norm": 0.6676474809646606, + "kl": 0.06787109375, + "learning_rate": 1e-06, + "loss": 0.0282, + "num_tokens": 69218995.0, + "reward": 1.2468751668930054, + "reward_std": 0.14453645050525665, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24687497317790985, + "rewards/curriculum_aware_reward_fn/std": 0.2920324206352234, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1383.0, + "completions/max_terminated_length": 1383.0, + "completions/mean_length": 495.6785888671875, + "completions/mean_terminated_length": 495.6785888671875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.6221305132834666, + "grad_norm": 0.6866009831428528, + "kl": 0.06170654296875, + "learning_rate": 1e-06, + "loss": 0.0374, + "num_tokens": 69344560.0, + "reward": 1.2156251668930054, + "reward_std": 0.1428852528333664, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21562497317790985, + "rewards/curriculum_aware_reward_fn/std": 0.2789160907268524, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1818.0, + "completions/max_terminated_length": 1818.0, + "completions/mean_length": 420.9910888671875, + "completions/mean_terminated_length": 420.9910888671875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.6231622388444674, + "grad_norm": 0.7409844994544983, + "kl": 0.06365966796875, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 69454067.0, + "reward": 1.3593751192092896, + "reward_std": 0.1385863721370697, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.359375, + "rewards/curriculum_aware_reward_fn/std": 0.3202287554740906, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 801.0, + "completions/max_terminated_length": 801.0, + "completions/mean_length": 434.39288330078125, + "completions/mean_terminated_length": 434.39288330078125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.6241939644054681, + "grad_norm": 0.7337531447410583, + "kl": 0.06573486328125, + "learning_rate": 1e-06, + "loss": -0.006, + "num_tokens": 69568001.0, + "reward": 1.2406251430511475, + "reward_std": 0.17304165661334991, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24062500894069672, + "rewards/curriculum_aware_reward_fn/std": 0.30818644165992737, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 995.0, + "completions/max_terminated_length": 995.0, + "completions/mean_length": 441.90179443359375, + "completions/mean_terminated_length": 441.90179443359375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.6252256899664689, + "grad_norm": 0.8032611608505249, + "kl": 0.0662841796875, + "learning_rate": 1e-06, + "loss": 0.0127, + "num_tokens": 69683322.0, + "reward": 1.28125, + "reward_std": 0.16203156113624573, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.29279062151908875, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 923.0, + "completions/max_terminated_length": 923.0, + "completions/mean_length": 412.4732360839844, + "completions/mean_terminated_length": 412.4732360839844, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.6262574155274697, + "grad_norm": 0.7155545353889465, + "kl": 0.06622314453125, + "learning_rate": 1e-06, + "loss": 0.0184, + "num_tokens": 69786912.0, + "reward": 1.3312501907348633, + "reward_std": 0.15303204953670502, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33124998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3283267021179199, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 972.0, + "completions/max_terminated_length": 972.0, + "completions/mean_length": 443.21429443359375, + "completions/mean_terminated_length": 443.21429443359375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.6272891410884704, + "grad_norm": 0.5675290822982788, + "kl": 0.0606689453125, + "learning_rate": 1e-06, + "loss": -0.0193, + "num_tokens": 69904991.0, + "reward": 1.2750002145767212, + "reward_std": 0.070383720099926, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.31315022706985474, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1062.0, + "completions/max_terminated_length": 1062.0, + "completions/mean_length": 471.83929443359375, + "completions/mean_terminated_length": 471.83929443359375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.6283208666494713, + "grad_norm": 0.760869026184082, + "kl": 0.06317138671875, + "learning_rate": 1e-06, + "loss": -0.0122, + "num_tokens": 70028656.0, + "reward": 1.240625023841858, + "reward_std": 0.1671050637960434, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24062499403953552, + "rewards/curriculum_aware_reward_fn/std": 0.2780669629573822, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 978.0, + "completions/max_terminated_length": 978.0, + "completions/mean_length": 513.2678833007812, + "completions/mean_terminated_length": 513.2678833007812, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.629352592210472, + "grad_norm": 0.7933395504951477, + "kl": 0.07415771484375, + "learning_rate": 1e-06, + "loss": 0.0297, + "num_tokens": 70160902.0, + "reward": 1.2781251668930054, + "reward_std": 0.18246570229530334, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.27812498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.304972380399704, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 366.5000305175781, + "completions/mean_terminated_length": 366.5000305175781, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.6303843177714727, + "grad_norm": 0.8295329213142395, + "kl": 0.0723876953125, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 70272286.0, + "reward": 1.3406251668930054, + "reward_std": 0.18557783961296082, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34062501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3025068938732147, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1052.0, + "completions/max_terminated_length": 1052.0, + "completions/mean_length": 413.7500305175781, + "completions/mean_terminated_length": 413.7500305175781, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.6314160433324736, + "grad_norm": 0.8231275081634521, + "kl": 0.0662841796875, + "learning_rate": 1e-06, + "loss": -0.007, + "num_tokens": 70377048.0, + "reward": 1.3906251192092896, + "reward_std": 0.16967806220054626, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3906249701976776, + "rewards/curriculum_aware_reward_fn/std": 0.31073373556137085, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 424.6964416503906, + "completions/mean_terminated_length": 424.6964416503906, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.6324477688934743, + "grad_norm": 0.832097053527832, + "kl": 0.0731201171875, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 70494306.0, + "reward": 1.2875001430511475, + "reward_std": 0.1668517291545868, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2874999940395355, + "rewards/curriculum_aware_reward_fn/std": 0.3123941123485565, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 480.26788330078125, + "completions/mean_terminated_length": 480.26788330078125, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.6334794944544752, + "grad_norm": 0.7808151841163635, + "kl": 0.0631103515625, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 70615478.0, + "reward": 1.1937501430511475, + "reward_std": 0.15479755401611328, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19374999403953552, + "rewards/curriculum_aware_reward_fn/std": 0.26922687888145447, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 449.794677734375, + "completions/mean_terminated_length": 449.794677734375, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.6345112200154759, + "grad_norm": 0.7162142395973206, + "kl": 0.0618896484375, + "learning_rate": 1e-06, + "loss": 0.0203, + "num_tokens": 70734455.0, + "reward": 1.2437500953674316, + "reward_std": 0.13446195423603058, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24374999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.256634920835495, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 918.0, + "completions/max_terminated_length": 918.0, + "completions/mean_length": 450.0000305175781, + "completions/mean_terminated_length": 450.0000305175781, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.6355429455764766, + "grad_norm": 0.7382791042327881, + "kl": 0.06561279296875, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 70849122.0, + "reward": 1.2875001430511475, + "reward_std": 0.18127289414405823, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28749996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.3123941123485565, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 447.8482360839844, + "completions/mean_terminated_length": 447.8482360839844, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.6365746711374775, + "grad_norm": 0.7692705392837524, + "kl": 0.05877685546875, + "learning_rate": 1e-06, + "loss": 0.0156, + "num_tokens": 70966614.0, + "reward": 1.1375001668930054, + "reward_std": 0.15338782966136932, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13750000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.22705517709255219, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 941.0, + "completions/max_terminated_length": 941.0, + "completions/mean_length": 447.919677734375, + "completions/mean_terminated_length": 447.919677734375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.6376063966984782, + "grad_norm": 0.7012402415275574, + "kl": 0.05401611328125, + "learning_rate": 1e-06, + "loss": 0.0106, + "num_tokens": 71083981.0, + "reward": 1.3375002145767212, + "reward_std": 0.14874815940856934, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.2816057503223419, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1107.0, + "completions/max_terminated_length": 1107.0, + "completions/mean_length": 466.1250305175781, + "completions/mean_terminated_length": 466.1250305175781, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.638638122259479, + "grad_norm": 0.6417734026908875, + "kl": 0.0552978515625, + "learning_rate": 1e-06, + "loss": 0.0219, + "num_tokens": 71207300.0, + "reward": 1.2468751668930054, + "reward_std": 0.12970300018787384, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24687500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.2994951903820038, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1298.0, + "completions/max_terminated_length": 1298.0, + "completions/mean_length": 463.857177734375, + "completions/mean_terminated_length": 463.857177734375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.6396698478204798, + "grad_norm": 0.635471522808075, + "kl": 0.05859375, + "learning_rate": 1e-06, + "loss": -0.0103, + "num_tokens": 71336189.0, + "reward": 1.2562501430511475, + "reward_std": 0.15377187728881836, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2562499940395355, + "rewards/curriculum_aware_reward_fn/std": 0.3288065195083618, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1085.0, + "completions/max_terminated_length": 1085.0, + "completions/mean_length": 454.8214416503906, + "completions/mean_terminated_length": 454.8214416503906, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.6407015733814805, + "grad_norm": 0.6547529101371765, + "kl": 0.0552978515625, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 71458634.0, + "reward": 1.2000000476837158, + "reward_std": 0.14462588727474213, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.20000001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.29231905937194824, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 370.3214416503906, + "completions/mean_terminated_length": 370.3214416503906, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.6417332989424813, + "grad_norm": 0.7429718375205994, + "kl": 0.0694580078125, + "learning_rate": 1e-06, + "loss": 0.014, + "num_tokens": 71569389.0, + "reward": 1.2750000953674316, + "reward_std": 0.1319045126438141, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3096059560775757, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 381.9821472167969, + "completions/mean_terminated_length": 381.9821472167969, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.642765024503482, + "grad_norm": 0.9691736102104187, + "kl": 0.0660400390625, + "learning_rate": 1e-06, + "loss": 0.0141, + "num_tokens": 71678482.0, + "reward": 1.3718751668930054, + "reward_std": 0.18426603078842163, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37187501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3090803921222687, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 405.5535888671875, + "completions/mean_terminated_length": 405.5535888671875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.6437967500644829, + "grad_norm": 0.809437096118927, + "kl": 0.06268310546875, + "learning_rate": 1e-06, + "loss": -0.0038, + "num_tokens": 71792245.0, + "reward": 1.3125, + "reward_std": 0.14292655885219574, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3125, + "rewards/curriculum_aware_reward_fn/std": 0.33006277680397034, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1030.0, + "completions/max_terminated_length": 1030.0, + "completions/mean_length": 448.2857360839844, + "completions/mean_terminated_length": 448.2857360839844, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.6448284756254836, + "grad_norm": 0.7424851059913635, + "kl": 0.05975341796875, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 71915498.0, + "reward": 1.2687500715255737, + "reward_std": 0.1576671600341797, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.26874998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3007591664791107, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1024.0, + "completions/mean_length": 393.0357360839844, + "completions/mean_terminated_length": 393.0357360839844, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.6458602011864844, + "grad_norm": 0.8500249981880188, + "kl": 0.0582275390625, + "learning_rate": 1e-06, + "loss": -0.0197, + "num_tokens": 72026235.0, + "reward": 1.3375002145767212, + "reward_std": 0.1441763937473297, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3374999463558197, + "rewards/curriculum_aware_reward_fn/std": 0.3005625307559967, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 405.6339416503906, + "completions/mean_terminated_length": 405.6339416503906, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.6468919267474852, + "grad_norm": 0.662321925163269, + "kl": 0.069580078125, + "learning_rate": 1e-06, + "loss": 0.0308, + "num_tokens": 72142373.0, + "reward": 1.2937501668930054, + "reward_std": 0.13335785269737244, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.31709015369415283, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 378.1250305175781, + "completions/mean_terminated_length": 378.1250305175781, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.6479236523084859, + "grad_norm": 0.5981107950210571, + "kl": 0.06591796875, + "learning_rate": 1e-06, + "loss": 0.0136, + "num_tokens": 72251867.0, + "reward": 1.2125000953674316, + "reward_std": 0.07582376897335052, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21249999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.3827061057090759, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 967.0, + "completions/max_terminated_length": 967.0, + "completions/mean_length": 358.5535888671875, + "completions/mean_terminated_length": 358.5535888671875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.6489553778694868, + "grad_norm": 0.9436933994293213, + "kl": 0.06439208984375, + "learning_rate": 1e-06, + "loss": 0.0086, + "num_tokens": 72345008.0, + "reward": 1.3625000715255737, + "reward_std": 0.1505720466375351, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3005625307559967, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 400.4821472167969, + "completions/mean_terminated_length": 367.189208984375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.6499871034304875, + "grad_norm": 0.5673825740814209, + "kl": 0.0594482421875, + "learning_rate": 1e-06, + "loss": 0.0438, + "num_tokens": 72450139.0, + "reward": 1.378571629524231, + "reward_std": 0.089703768491745, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.38749998807907104, + "rewards/curriculum_aware_reward_fn/std": 0.3267020583152771, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.0, + "completions/max_terminated_length": 792.0, + "completions/mean_length": 416.9464416503906, + "completions/mean_terminated_length": 416.9464416503906, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.6510188289914882, + "grad_norm": 0.6781731247901917, + "kl": 0.057373046875, + "learning_rate": 1e-06, + "loss": 0.016, + "num_tokens": 72560537.0, + "reward": 1.2093751430511475, + "reward_std": 0.1222841814160347, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.20937499403953552, + "rewards/curriculum_aware_reward_fn/std": 0.2988364100456238, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 455.1250305175781, + "completions/mean_terminated_length": 455.1250305175781, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.6520505545524891, + "grad_norm": 0.7097309231758118, + "kl": 0.05877685546875, + "learning_rate": 1e-06, + "loss": -0.012, + "num_tokens": 72678774.0, + "reward": 1.1687501668930054, + "reward_std": 0.13737669587135315, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16875001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.2738715410232544, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 802.0, + "completions/max_terminated_length": 802.0, + "completions/mean_length": 399.3125305175781, + "completions/mean_terminated_length": 399.3125305175781, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.6530822801134898, + "grad_norm": 0.6966571807861328, + "kl": 0.05914306640625, + "learning_rate": 1e-06, + "loss": -0.0097, + "num_tokens": 72794778.0, + "reward": 1.2218750715255737, + "reward_std": 0.1375676989555359, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22187498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.300808310508728, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 853.0, + "completions/max_terminated_length": 853.0, + "completions/mean_length": 408.9821472167969, + "completions/mean_terminated_length": 408.9821472167969, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.6541140056744906, + "grad_norm": 0.677052915096283, + "kl": 0.06536865234375, + "learning_rate": 1e-06, + "loss": -0.0176, + "num_tokens": 72907343.0, + "reward": 1.3156250715255737, + "reward_std": 0.14504189789295197, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3114938735961914, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1008.0, + "completions/max_terminated_length": 1008.0, + "completions/mean_length": 427.0982360839844, + "completions/mean_terminated_length": 427.0982360839844, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.6551457312354914, + "grad_norm": 0.7582041621208191, + "kl": 0.0594482421875, + "learning_rate": 1e-06, + "loss": 0.0127, + "num_tokens": 73027782.0, + "reward": 1.1968750953674316, + "reward_std": 0.14314253628253937, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.2730608284473419, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 393.33929443359375, + "completions/mean_terminated_length": 393.33929443359375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.6561774567964921, + "grad_norm": 0.832430362701416, + "kl": 0.0611572265625, + "learning_rate": 1e-06, + "loss": 0.0177, + "num_tokens": 73141605.0, + "reward": 1.2750000953674316, + "reward_std": 0.1692698895931244, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2749999761581421, + "rewards/curriculum_aware_reward_fn/std": 0.302392840385437, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1235.0, + "completions/max_terminated_length": 1235.0, + "completions/mean_length": 443.5357360839844, + "completions/mean_terminated_length": 443.5357360839844, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.6572091823574929, + "grad_norm": 0.7490401268005371, + "kl": 0.05474853515625, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 73264844.0, + "reward": 1.1906250715255737, + "reward_std": 0.13961127400398254, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19062499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.27749940752983093, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 397.8125305175781, + "completions/mean_terminated_length": 397.8125305175781, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.6582409079184937, + "grad_norm": 0.6362108588218689, + "kl": 0.0712890625, + "learning_rate": 1e-06, + "loss": 0.0114, + "num_tokens": 73380897.0, + "reward": 1.1968750953674316, + "reward_std": 0.1181226521730423, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.28102782368659973, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 775.0, + "completions/max_terminated_length": 775.0, + "completions/mean_length": 378.5446472167969, + "completions/mean_terminated_length": 378.5446472167969, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.6592726334794945, + "grad_norm": 0.8100889325141907, + "kl": 0.0693359375, + "learning_rate": 1e-06, + "loss": 0.0075, + "num_tokens": 73497585.0, + "reward": 1.2156251668930054, + "reward_std": 0.15342311561107635, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21562500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.2867204546928406, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1265.0, + "completions/max_terminated_length": 1265.0, + "completions/mean_length": 457.0714416503906, + "completions/mean_terminated_length": 457.0714416503906, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.6603043590404952, + "grad_norm": 0.6374899744987488, + "kl": 0.0521240234375, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 73629291.0, + "reward": 1.2781251668930054, + "reward_std": 0.1284002810716629, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.27812498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.32596227526664734, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 385.7946472167969, + "completions/mean_terminated_length": 385.7946472167969, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.661336084601496, + "grad_norm": 0.7600776553153992, + "kl": 0.072021484375, + "learning_rate": 1e-06, + "loss": 0.0133, + "num_tokens": 73740064.0, + "reward": 1.2218750715255737, + "reward_std": 0.12513509392738342, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22187498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.28959283232688904, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 712.0, + "completions/max_terminated_length": 712.0, + "completions/mean_length": 418.40179443359375, + "completions/mean_terminated_length": 418.40179443359375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.6623678101624968, + "grad_norm": 0.6850177645683289, + "kl": 0.0572509765625, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 73855861.0, + "reward": 1.371875286102295, + "reward_std": 0.15071289241313934, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37187501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3364347517490387, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 843.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 375.8571472167969, + "completions/mean_terminated_length": 375.8571472167969, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.6633995357234975, + "grad_norm": 0.7585674524307251, + "kl": 0.058349609375, + "learning_rate": 1e-06, + "loss": -0.0225, + "num_tokens": 73962987.0, + "reward": 1.3531252145767212, + "reward_std": 0.1872883141040802, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3271692097187042, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1148.0, + "completions/max_terminated_length": 1148.0, + "completions/mean_length": 409.4375305175781, + "completions/mean_terminated_length": 409.4375305175781, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.6644312612844984, + "grad_norm": 0.6967830061912537, + "kl": 0.05999755859375, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 74079950.0, + "reward": 1.2312501668930054, + "reward_std": 0.13037815690040588, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.23124997317790985, + "rewards/curriculum_aware_reward_fn/std": 0.28790363669395447, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 915.0, + "completions/max_terminated_length": 915.0, + "completions/mean_length": 408.7232360839844, + "completions/mean_terminated_length": 408.7232360839844, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.6654629868454991, + "grad_norm": 0.6875054240226746, + "kl": 0.0599365234375, + "learning_rate": 1e-06, + "loss": -0.0128, + "num_tokens": 74191471.0, + "reward": 1.309375286102295, + "reward_std": 0.11628562211990356, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30937498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.3177575469017029, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 852.0, + "completions/max_terminated_length": 852.0, + "completions/mean_length": 397.5446472167969, + "completions/mean_terminated_length": 397.5446472167969, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.6664947124064998, + "grad_norm": 0.6609368920326233, + "kl": 0.064208984375, + "learning_rate": 1e-06, + "loss": -0.0132, + "num_tokens": 74306584.0, + "reward": 1.2375000715255737, + "reward_std": 0.1822759509086609, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.23749999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.3193814158439636, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 800.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 392.6696472167969, + "completions/mean_terminated_length": 392.6696472167969, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.6675264379675007, + "grad_norm": 0.8001618385314941, + "kl": 0.07421875, + "learning_rate": 1e-06, + "loss": 0.0138, + "num_tokens": 74416706.0, + "reward": 1.3500001430511475, + "reward_std": 0.15462636947631836, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3499999940395355, + "rewards/curriculum_aware_reward_fn/std": 0.30082467198371887, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 805.0, + "completions/mean_length": 478.0625305175781, + "completions/mean_terminated_length": 445.4684753417969, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.6685581635285014, + "grad_norm": 0.7765113115310669, + "kl": 0.0609130859375, + "learning_rate": 1e-06, + "loss": 0.0444, + "num_tokens": 74536665.0, + "reward": 1.2191965579986572, + "reward_std": 0.20141837000846863, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.22812499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.27678829431533813, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1479.0, + "completions/max_terminated_length": 1479.0, + "completions/mean_length": 422.0982360839844, + "completions/mean_terminated_length": 422.0982360839844, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.6695898890895022, + "grad_norm": 0.6240831017494202, + "kl": 0.06243896484375, + "learning_rate": 1e-06, + "loss": -0.0382, + "num_tokens": 74654178.0, + "reward": 1.3718751668930054, + "reward_std": 0.08518895506858826, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37187498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.3126305937767029, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1192.0, + "completions/max_terminated_length": 1192.0, + "completions/mean_length": 387.9732360839844, + "completions/mean_terminated_length": 387.9732360839844, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.670621614650503, + "grad_norm": 0.8223195672035217, + "kl": 0.06793212890625, + "learning_rate": 1e-06, + "loss": -0.0082, + "num_tokens": 74756662.0, + "reward": 1.4410717487335205, + "reward_std": 0.2007153332233429, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.44999998807907104, + "rewards/curriculum_aware_reward_fn/std": 0.3166548013687134, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 432.2589416503906, + "completions/mean_terminated_length": 432.2589416503906, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.6716533402115037, + "grad_norm": 0.5505730509757996, + "kl": 0.06182861328125, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 74881165.0, + "reward": 1.1875, + "reward_std": 0.0919622927904129, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1875, + "rewards/curriculum_aware_reward_fn/std": 0.2854978144168854, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 821.0, + "completions/max_terminated_length": 821.0, + "completions/mean_length": 416.5714416503906, + "completions/mean_terminated_length": 416.5714416503906, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.6726850657725045, + "grad_norm": 0.5758817791938782, + "kl": 0.06842041015625, + "learning_rate": 1e-06, + "loss": 0.0103, + "num_tokens": 74989302.0, + "reward": 1.2375000715255737, + "reward_std": 0.11903390288352966, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.23749999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.2979282736778259, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 394.2857360839844, + "completions/mean_terminated_length": 394.2857360839844, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.6737167913335053, + "grad_norm": 0.635651171207428, + "kl": 0.0662841796875, + "learning_rate": 1e-06, + "loss": -0.0116, + "num_tokens": 75100564.0, + "reward": 1.1660715341567993, + "reward_std": 0.15635573863983154, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.17499998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.2896098494529724, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 371.95538330078125, + "completions/mean_terminated_length": 371.95538330078125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.6747485168945061, + "grad_norm": 0.6044211387634277, + "kl": 0.06475830078125, + "learning_rate": 1e-06, + "loss": -0.02, + "num_tokens": 75211943.0, + "reward": 1.3000000715255737, + "reward_std": 0.12482405453920364, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30000001192092896, + "rewards/curriculum_aware_reward_fn/std": 0.32159513235092163, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 343.58929443359375, + "completions/mean_terminated_length": 343.58929443359375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.6757802424555068, + "grad_norm": 0.8712995052337646, + "kl": 0.067138671875, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 75312683.0, + "reward": 1.325000286102295, + "reward_std": 0.16854573786258698, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32499998807907104, + "rewards/curriculum_aware_reward_fn/std": 0.3211045265197754, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1504.0, + "completions/max_terminated_length": 1504.0, + "completions/mean_length": 459.7500305175781, + "completions/mean_terminated_length": 459.7500305175781, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.6768119680165076, + "grad_norm": 0.8327785134315491, + "kl": 0.0638427734375, + "learning_rate": 1e-06, + "loss": 0.0131, + "num_tokens": 75435694.0, + "reward": 1.2375000715255737, + "reward_std": 0.17822317779064178, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.23749998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.2979282736778259, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 390.7232360839844, + "completions/mean_terminated_length": 390.7232360839844, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.6778436935775084, + "grad_norm": 0.580292284488678, + "kl": 0.06488037109375, + "learning_rate": 1e-06, + "loss": 0.0074, + "num_tokens": 75545927.0, + "reward": 1.343750238418579, + "reward_std": 0.08745487779378891, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34375, + "rewards/curriculum_aware_reward_fn/std": 0.2970671057701111, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 943.0, + "completions/max_terminated_length": 943.0, + "completions/mean_length": 421.8571472167969, + "completions/mean_terminated_length": 421.8571472167969, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.6788754191385091, + "grad_norm": 0.5890258550643921, + "kl": 0.071533203125, + "learning_rate": 1e-06, + "loss": -0.0057, + "num_tokens": 75664295.0, + "reward": 1.1437500715255737, + "reward_std": 0.11732659488916397, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14374999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.2639039158821106, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 984.0, + "completions/max_terminated_length": 984.0, + "completions/mean_length": 415.89288330078125, + "completions/mean_terminated_length": 415.89288330078125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.67990714469951, + "grad_norm": 0.6905220746994019, + "kl": 0.06536865234375, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 75778712.0, + "reward": 1.312500238418579, + "reward_std": 0.1588037759065628, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3125, + "rewards/curriculum_aware_reward_fn/std": 0.3021320700645447, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1382.0, + "completions/max_terminated_length": 1382.0, + "completions/mean_length": 440.4910888671875, + "completions/mean_terminated_length": 440.4910888671875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.6809388702605107, + "grad_norm": 0.7469112277030945, + "kl": 0.06982421875, + "learning_rate": 1e-06, + "loss": -0.0094, + "num_tokens": 75902331.0, + "reward": 1.403125286102295, + "reward_std": 0.13282155990600586, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40312501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3193660080432892, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1259.0, + "completions/max_terminated_length": 1259.0, + "completions/mean_length": 421.3125305175781, + "completions/mean_terminated_length": 421.3125305175781, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.6819705958215114, + "grad_norm": 0.7019675970077515, + "kl": 0.0772705078125, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 76018650.0, + "reward": 1.25, + "reward_std": 0.14943593740463257, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2499999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.32012102007865906, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1330.0, + "completions/max_terminated_length": 1330.0, + "completions/mean_length": 436.3214416503906, + "completions/mean_terminated_length": 436.3214416503906, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.6830023213825123, + "grad_norm": 0.6849479079246521, + "kl": 0.0711669921875, + "learning_rate": 1e-06, + "loss": 0.0196, + "num_tokens": 76146828.0, + "reward": 1.2281250953674316, + "reward_std": 0.12466341257095337, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22812499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.2885020077228546, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 876.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 432.1696472167969, + "completions/mean_terminated_length": 432.1696472167969, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.684034046943513, + "grad_norm": 0.6404550671577454, + "kl": 0.07568359375, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 76259113.0, + "reward": 1.3937500715255737, + "reward_std": 0.12282286584377289, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39374998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.38203608989715576, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1048.0, + "completions/max_terminated_length": 1048.0, + "completions/mean_length": 444.96429443359375, + "completions/mean_terminated_length": 444.96429443359375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.6850657725045138, + "grad_norm": 0.602090060710907, + "kl": 0.0660400390625, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 76381840.0, + "reward": 1.171875, + "reward_std": 0.10328420996665955, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.171875, + "rewards/curriculum_aware_reward_fn/std": 0.26574569940567017, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 847.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 415.9285888671875, + "completions/mean_terminated_length": 415.9285888671875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.6860974980655146, + "grad_norm": 0.6655356884002686, + "kl": 0.0723876953125, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 76496856.0, + "reward": 1.3187501430511475, + "reward_std": 0.1583477109670639, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3187499940395355, + "rewards/curriculum_aware_reward_fn/std": 0.32055166363716125, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 388.5625305175781, + "completions/mean_terminated_length": 388.5625305175781, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.6871292236265153, + "grad_norm": 0.825885534286499, + "kl": 0.075927734375, + "learning_rate": 1e-06, + "loss": 0.0051, + "num_tokens": 76604285.0, + "reward": 1.418750286102295, + "reward_std": 0.1810518056154251, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41875001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.31106650829315186, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 936.0, + "completions/max_terminated_length": 936.0, + "completions/mean_length": 420.08929443359375, + "completions/mean_terminated_length": 420.08929443359375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.6881609491875161, + "grad_norm": 0.6745756268501282, + "kl": 0.08447265625, + "learning_rate": 1e-06, + "loss": 0.0279, + "num_tokens": 76715175.0, + "reward": 1.2843750715255737, + "reward_std": 0.138127863407135, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28437498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.32390013337135315, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1529.0, + "completions/max_terminated_length": 1529.0, + "completions/mean_length": 478.8750305175781, + "completions/mean_terminated_length": 478.8750305175781, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.6891926747485169, + "grad_norm": 0.7624357342720032, + "kl": 0.0677490234375, + "learning_rate": 1e-06, + "loss": 0.0084, + "num_tokens": 76839005.0, + "reward": 1.2750000953674316, + "reward_std": 0.2256644070148468, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3466051518917084, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 441.8035888671875, + "completions/mean_terminated_length": 441.8035888671875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.6902244003095177, + "grad_norm": 0.6996376514434814, + "kl": 0.07421875, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 76950346.0, + "reward": 1.3218752145767212, + "reward_std": 0.13565517961978912, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3218750059604645, + "rewards/curriculum_aware_reward_fn/std": 0.31212589144706726, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 956.0, + "completions/max_terminated_length": 956.0, + "completions/mean_length": 488.27679443359375, + "completions/mean_terminated_length": 488.27679443359375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.6912561258705184, + "grad_norm": 0.5711653232574463, + "kl": 0.07177734375, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 77079044.0, + "reward": 1.2129465341567993, + "reward_std": 0.14019827544689178, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.22187498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3044550120830536, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1641.0, + "completions/max_terminated_length": 1641.0, + "completions/mean_length": 510.6964416503906, + "completions/mean_terminated_length": 510.6964416503906, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.6922878514315192, + "grad_norm": 0.6841708421707153, + "kl": 0.06170654296875, + "learning_rate": 1e-06, + "loss": 0.0167, + "num_tokens": 77211828.0, + "reward": 1.1750000715255737, + "reward_std": 0.1548759937286377, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.17499998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.25300002098083496, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1008.0, + "completions/max_terminated_length": 1008.0, + "completions/mean_length": 465.857177734375, + "completions/mean_terminated_length": 465.857177734375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.69331957699252, + "grad_norm": 0.6161367893218994, + "kl": 0.07373046875, + "learning_rate": 1e-06, + "loss": 0.0078, + "num_tokens": 77343216.0, + "reward": 1.2437502145767212, + "reward_std": 0.1301034688949585, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24374999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.3180830180644989, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 434.6785888671875, + "completions/mean_terminated_length": 434.6785888671875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.6943513025535207, + "grad_norm": 0.7548561096191406, + "kl": 0.07275390625, + "learning_rate": 1e-06, + "loss": -0.0053, + "num_tokens": 77455656.0, + "reward": 1.3468750715255737, + "reward_std": 0.1485518515110016, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.2914920449256897, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1407.0, + "completions/max_terminated_length": 1407.0, + "completions/mean_length": 464.14288330078125, + "completions/mean_terminated_length": 464.14288330078125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.6953830281145216, + "grad_norm": 0.784842312335968, + "kl": 0.071533203125, + "learning_rate": 1e-06, + "loss": 0.0229, + "num_tokens": 77573895.0, + "reward": 1.3250001668930054, + "reward_std": 0.22508502006530762, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32500001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3245232105255127, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 461.7589416503906, + "completions/mean_terminated_length": 461.7589416503906, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.6964147536755223, + "grad_norm": 0.6696816086769104, + "kl": 0.0767822265625, + "learning_rate": 1e-06, + "loss": 0.0223, + "num_tokens": 77698622.0, + "reward": 1.171875, + "reward_std": 0.10575476288795471, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.171875, + "rewards/curriculum_aware_reward_fn/std": 0.2615598738193512, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1178.0, + "completions/max_terminated_length": 1178.0, + "completions/mean_length": 535.7053833007812, + "completions/mean_terminated_length": 535.7053833007812, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.697446479236523, + "grad_norm": 0.6169516444206238, + "kl": 0.065673828125, + "learning_rate": 1e-06, + "loss": 0.0181, + "num_tokens": 77841053.0, + "reward": 1.1812500953674316, + "reward_std": 0.153910830616951, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.18124999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.28181561827659607, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 887.0, + "completions/max_terminated_length": 887.0, + "completions/mean_length": 446.732177734375, + "completions/mean_terminated_length": 446.732177734375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.6984782047975239, + "grad_norm": 0.6587204933166504, + "kl": 0.0693359375, + "learning_rate": 1e-06, + "loss": 0.0321, + "num_tokens": 77962396.0, + "reward": 1.2687500715255737, + "reward_std": 0.14465731382369995, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.26875001192092896, + "rewards/curriculum_aware_reward_fn/std": 0.32543283700942993, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1089.0, + "completions/max_terminated_length": 1089.0, + "completions/mean_length": 489.3125305175781, + "completions/mean_terminated_length": 489.3125305175781, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.6995099303585246, + "grad_norm": 0.48290061950683594, + "kl": 0.066650390625, + "learning_rate": 1e-06, + "loss": -0.0171, + "num_tokens": 78094500.0, + "reward": 1.1218751668930054, + "reward_std": 0.07095565646886826, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.22388675808906555, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 459.2410888671875, + "completions/mean_terminated_length": 459.2410888671875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.7005416559195254, + "grad_norm": 0.7850268483161926, + "kl": 0.070556640625, + "learning_rate": 1e-06, + "loss": -0.0026, + "num_tokens": 78214591.0, + "reward": 1.2843750715255737, + "reward_std": 0.18927761912345886, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28437498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3027673363685608, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1009.0, + "completions/max_terminated_length": 1009.0, + "completions/mean_length": 484.6160888671875, + "completions/mean_terminated_length": 484.6160888671875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.7015733814805262, + "grad_norm": 0.6543923020362854, + "kl": 0.0738525390625, + "learning_rate": 1e-06, + "loss": -0.0326, + "num_tokens": 78343796.0, + "reward": 1.2254464626312256, + "reward_std": 0.18909485638141632, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.234375, + "rewards/curriculum_aware_reward_fn/std": 0.2948530614376068, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1448.0, + "completions/max_terminated_length": 1448.0, + "completions/mean_length": 520.3928833007812, + "completions/mean_terminated_length": 520.3928833007812, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.7026051070415269, + "grad_norm": 0.7107188701629639, + "kl": 0.0721435546875, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 78476322.0, + "reward": 1.2410715818405151, + "reward_std": 0.15162546932697296, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.25, + "rewards/curriculum_aware_reward_fn/std": 0.28742408752441406, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1740.0, + "completions/max_terminated_length": 1740.0, + "completions/mean_length": 584.9553833007812, + "completions/mean_terminated_length": 584.9553833007812, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.7036368326025277, + "grad_norm": 0.7118996381759644, + "kl": 0.063232421875, + "learning_rate": 1e-06, + "loss": 0.0138, + "num_tokens": 78611522.0, + "reward": 1.2468751668930054, + "reward_std": 0.17215028405189514, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24687500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.30315765738487244, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 926.0, + "completions/max_terminated_length": 926.0, + "completions/mean_length": 475.76788330078125, + "completions/mean_terminated_length": 475.76788330078125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.7046685581635285, + "grad_norm": 0.7987418174743652, + "kl": 0.078125, + "learning_rate": 1e-06, + "loss": 0.0297, + "num_tokens": 78730902.0, + "reward": 1.3218750953674316, + "reward_std": 0.2143011838197708, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3218750059604645, + "rewards/curriculum_aware_reward_fn/std": 0.29764699935913086, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1221.0, + "completions/max_terminated_length": 1221.0, + "completions/mean_length": 487.83038330078125, + "completions/mean_terminated_length": 487.83038330078125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.7057002837245293, + "grad_norm": 0.7103044986724854, + "kl": 0.070556640625, + "learning_rate": 1e-06, + "loss": 0.0213, + "num_tokens": 78849337.0, + "reward": 1.2875001430511475, + "reward_std": 0.16721250116825104, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2874999940395355, + "rewards/curriculum_aware_reward_fn/std": 0.2979282736778259, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1761.0, + "completions/max_terminated_length": 1761.0, + "completions/mean_length": 539.0, + "completions/mean_terminated_length": 539.0, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.70673200928553, + "grad_norm": 0.4926302134990692, + "kl": 0.0657958984375, + "learning_rate": 1e-06, + "loss": 0.0254, + "num_tokens": 78977080.0, + "reward": 1.296875, + "reward_std": 0.11531693488359451, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.296875, + "rewards/curriculum_aware_reward_fn/std": 0.3052307665348053, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1386.0, + "completions/max_terminated_length": 1386.0, + "completions/mean_length": 535.2232666015625, + "completions/mean_terminated_length": 535.2232666015625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.7077637348465308, + "grad_norm": 0.9094158411026001, + "kl": 0.0721435546875, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 79110476.0, + "reward": 1.28125, + "reward_std": 0.13001351058483124, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.32832667231559753, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 914.0, + "completions/max_terminated_length": 914.0, + "completions/mean_length": 448.3035888671875, + "completions/mean_terminated_length": 448.3035888671875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.7087954604075316, + "grad_norm": 0.7349770069122314, + "kl": 0.07373046875, + "learning_rate": 1e-06, + "loss": 0.0114, + "num_tokens": 79221187.0, + "reward": 1.3508931398391724, + "reward_std": 0.2023835927248001, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.3687500059604645, + "rewards/curriculum_aware_reward_fn/std": 0.30023449659347534, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 925.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 497.8125305175781, + "completions/mean_terminated_length": 497.8125305175781, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.7098271859685323, + "grad_norm": 0.5558754205703735, + "kl": 0.0653076171875, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 79336057.0, + "reward": 1.21875, + "reward_std": 0.11050201207399368, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21875, + "rewards/curriculum_aware_reward_fn/std": 0.32251298427581787, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1387.0, + "completions/max_terminated_length": 1387.0, + "completions/mean_length": 486.3750305175781, + "completions/mean_terminated_length": 486.3750305175781, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.7108589115295332, + "grad_norm": 0.6595085263252258, + "kl": 0.079833984375, + "learning_rate": 1e-06, + "loss": 0.0301, + "num_tokens": 79465298.0, + "reward": 1.4093750715255737, + "reward_std": 0.14184318482875824, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40937498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3004149794578552, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 909.0, + "completions/max_terminated_length": 909.0, + "completions/mean_length": 480.01788330078125, + "completions/mean_terminated_length": 480.01788330078125, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.7118906370905339, + "grad_norm": 0.7275698781013489, + "kl": 0.0732421875, + "learning_rate": 1e-06, + "loss": 0.0106, + "num_tokens": 79579050.0, + "reward": 1.296875238418579, + "reward_std": 0.21182642877101898, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.296875, + "rewards/curriculum_aware_reward_fn/std": 0.3123783469200134, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1623.0, + "completions/max_terminated_length": 1623.0, + "completions/mean_length": 486.669677734375, + "completions/mean_terminated_length": 486.669677734375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.7129223626515347, + "grad_norm": 0.7305976748466492, + "kl": 0.076416015625, + "learning_rate": 1e-06, + "loss": 0.0256, + "num_tokens": 79693022.0, + "reward": 1.3004463911056519, + "reward_std": 0.1608843356370926, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.32121187448501587, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1125.0, + "completions/max_terminated_length": 1125.0, + "completions/mean_length": 468.2589416503906, + "completions/mean_terminated_length": 468.2589416503906, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.7139540882125355, + "grad_norm": 0.7382761836051941, + "kl": 0.0733642578125, + "learning_rate": 1e-06, + "loss": 0.0329, + "num_tokens": 79808166.0, + "reward": 1.375000238418579, + "reward_std": 0.15719659626483917, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.375, + "rewards/curriculum_aware_reward_fn/std": 0.3106227219104767, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 423.52679443359375, + "completions/mean_terminated_length": 423.52679443359375, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.7149858137735362, + "grad_norm": 0.7762972712516785, + "kl": 0.0870361328125, + "learning_rate": 1e-06, + "loss": 0.0149, + "num_tokens": 79922514.0, + "reward": 1.3593751192092896, + "reward_std": 0.18821999430656433, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.359375, + "rewards/curriculum_aware_reward_fn/std": 0.33702000975608826, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 914.0, + "completions/max_terminated_length": 914.0, + "completions/mean_length": 455.6607360839844, + "completions/mean_terminated_length": 455.6607360839844, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.716017539334537, + "grad_norm": 0.6777034997940063, + "kl": 0.0794677734375, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 80039530.0, + "reward": 1.359375238418579, + "reward_std": 0.1436331868171692, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.359375, + "rewards/curriculum_aware_reward_fn/std": 0.33702000975608826, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 424.7232360839844, + "completions/mean_terminated_length": 424.7232360839844, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.7170492648955378, + "grad_norm": 0.6738489866256714, + "kl": 0.07177734375, + "learning_rate": 1e-06, + "loss": 0.0337, + "num_tokens": 80153732.0, + "reward": 1.4812501668930054, + "reward_std": 0.17275573313236237, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48124998807907104, + "rewards/curriculum_aware_reward_fn/std": 0.37621423602104187, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1146.0, + "completions/max_terminated_length": 1146.0, + "completions/mean_length": 502.5357360839844, + "completions/mean_terminated_length": 502.5357360839844, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.7180809904565386, + "grad_norm": 0.7249394059181213, + "kl": 0.05865478515625, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 80277055.0, + "reward": 1.3781250715255737, + "reward_std": 0.15655311942100525, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.31911906599998474, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1049.0, + "completions/max_terminated_length": 1049.0, + "completions/mean_length": 483.26788330078125, + "completions/mean_terminated_length": 483.26788330078125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.7191127160175393, + "grad_norm": 0.6074086427688599, + "kl": 0.06689453125, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 80405122.0, + "reward": 1.296875, + "reward_std": 0.08536109328269958, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.296875, + "rewards/curriculum_aware_reward_fn/std": 0.3052307665348053, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 933.0, + "completions/max_terminated_length": 933.0, + "completions/mean_length": 482.232177734375, + "completions/mean_terminated_length": 482.232177734375, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.7201444415785401, + "grad_norm": 0.571091890335083, + "kl": 0.078125, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 80536384.0, + "reward": 1.2062500715255737, + "reward_std": 0.11699223518371582, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.20624999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.291711688041687, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 929.0, + "completions/max_terminated_length": 929.0, + "completions/mean_length": 490.2410888671875, + "completions/mean_terminated_length": 490.2410888671875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.7211761671395409, + "grad_norm": 0.7024215459823608, + "kl": 0.0699462890625, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 80666373.0, + "reward": 1.25, + "reward_std": 0.167774498462677, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25, + "rewards/curriculum_aware_reward_fn/std": 0.3060206472873688, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 434.8839416503906, + "completions/mean_terminated_length": 434.8839416503906, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.7222078927005416, + "grad_norm": 0.7631708383560181, + "kl": 0.0819091796875, + "learning_rate": 1e-06, + "loss": 0.0054, + "num_tokens": 80787369.0, + "reward": 1.3125, + "reward_std": 0.16144998371601105, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3125, + "rewards/curriculum_aware_reward_fn/std": 0.31987470388412476, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 440.8750305175781, + "completions/mean_terminated_length": 440.8750305175781, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.7232396182615425, + "grad_norm": 0.7090742588043213, + "kl": 0.080322265625, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 80901896.0, + "reward": 1.3468750715255737, + "reward_std": 0.1540508270263672, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3305251896381378, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 962.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 463.01788330078125, + "completions/mean_terminated_length": 463.01788330078125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.7242713438225432, + "grad_norm": 0.7182156443595886, + "kl": 0.0772705078125, + "learning_rate": 1e-06, + "loss": 0.0186, + "num_tokens": 81022494.0, + "reward": 1.3312500715255737, + "reward_std": 0.1629144847393036, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33124998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3249480128288269, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 979.0, + "completions/max_terminated_length": 979.0, + "completions/mean_length": 460.4285888671875, + "completions/mean_terminated_length": 460.4285888671875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.7253030693835439, + "grad_norm": 0.6034790277481079, + "kl": 0.0723876953125, + "learning_rate": 1e-06, + "loss": 0.0095, + "num_tokens": 81138560.0, + "reward": 1.2531250715255737, + "reward_std": 0.10742086172103882, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25312498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3228031396865845, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 837.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 455.27679443359375, + "completions/mean_terminated_length": 455.27679443359375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.7263347949445448, + "grad_norm": 0.6957827806472778, + "kl": 0.076171875, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 81260041.0, + "reward": 1.4406250715255737, + "reward_std": 0.1609530746936798, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44062498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.29243704676628113, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 494.1785888671875, + "completions/mean_terminated_length": 494.1785888671875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.7273665205055455, + "grad_norm": 0.6472601294517517, + "kl": 0.0770263671875, + "learning_rate": 1e-06, + "loss": 0.0421, + "num_tokens": 81386292.0, + "reward": 1.2625001668930054, + "reward_std": 0.1701442450284958, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.26250001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.302653431892395, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 435.76788330078125, + "completions/mean_terminated_length": 435.76788330078125, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.7283982460665464, + "grad_norm": 0.6905910968780518, + "kl": 0.09326171875, + "learning_rate": 1e-06, + "loss": -0.0209, + "num_tokens": 81497958.0, + "reward": 1.3312500715255737, + "reward_std": 0.11016502231359482, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33124998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.32832667231559753, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 417.83929443359375, + "completions/mean_terminated_length": 417.83929443359375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.7294299716275471, + "grad_norm": 0.7361639738082886, + "kl": 0.082275390625, + "learning_rate": 1e-06, + "loss": 0.0271, + "num_tokens": 81609688.0, + "reward": 1.3875001668930054, + "reward_std": 0.17972160875797272, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38749998807907104, + "rewards/curriculum_aware_reward_fn/std": 0.3968626856803894, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1104.0, + "completions/max_terminated_length": 1104.0, + "completions/mean_length": 489.4375305175781, + "completions/mean_terminated_length": 489.4375305175781, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.7304616971885478, + "grad_norm": 0.5894014239311218, + "kl": 0.077880859375, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 81734190.0, + "reward": 1.25, + "reward_std": 0.13956649601459503, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2499999850988388, + "rewards/curriculum_aware_reward_fn/std": 0.3369176685810089, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 424.9821472167969, + "completions/mean_terminated_length": 424.9821472167969, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.7314934227495486, + "grad_norm": 0.7483744621276855, + "kl": 0.0867919921875, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 81851144.0, + "reward": 1.3937500715255737, + "reward_std": 0.15640275180339813, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39375001192092896, + "rewards/curriculum_aware_reward_fn/std": 0.29759734869003296, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.0, + "completions/max_terminated_length": 734.0, + "completions/mean_length": 415.7321472167969, + "completions/mean_terminated_length": 415.7321472167969, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.7325251483105494, + "grad_norm": 0.620214581489563, + "kl": 0.08349609375, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 81966327.0, + "reward": 1.271875023841858, + "reward_std": 0.1221185103058815, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2718749940395355, + "rewards/curriculum_aware_reward_fn/std": 0.29975825548171997, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1323.0, + "completions/max_terminated_length": 1323.0, + "completions/mean_length": 478.6964416503906, + "completions/mean_terminated_length": 478.6964416503906, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.7335568738715502, + "grad_norm": 0.6635290384292603, + "kl": 0.0855712890625, + "learning_rate": 1e-06, + "loss": -0.0056, + "num_tokens": 82089682.0, + "reward": 1.2843750715255737, + "reward_std": 0.14508238434791565, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28437498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3063907325267792, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 617.0, + "completions/max_terminated_length": 617.0, + "completions/mean_length": 422.2589416503906, + "completions/mean_terminated_length": 422.2589416503906, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.734588599432551, + "grad_norm": 0.7747433185577393, + "kl": 0.0855712890625, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 82209649.0, + "reward": 1.3093751668930054, + "reward_std": 0.14272122085094452, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3071615993976593, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 965.0, + "completions/max_terminated_length": 965.0, + "completions/mean_length": 475.58038330078125, + "completions/mean_terminated_length": 475.58038330078125, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.7356203249935517, + "grad_norm": 0.6959653496742249, + "kl": 0.0831298828125, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 82323646.0, + "reward": 1.1906250715255737, + "reward_std": 0.11970683187246323, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19062498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.23904192447662354, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 799.0, + "completions/max_terminated_length": 799.0, + "completions/mean_length": 404.7946472167969, + "completions/mean_terminated_length": 404.7946472167969, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.7366520505545525, + "grad_norm": 0.7410394549369812, + "kl": 0.0875244140625, + "learning_rate": 1e-06, + "loss": 0.0261, + "num_tokens": 82431325.0, + "reward": 1.3906251192092896, + "reward_std": 0.1362684816122055, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.390625, + "rewards/curriculum_aware_reward_fn/std": 0.353905588388443, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1029.0, + "completions/max_terminated_length": 1029.0, + "completions/mean_length": 425.1696472167969, + "completions/mean_terminated_length": 425.1696472167969, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.7376837761155532, + "grad_norm": 0.7336283922195435, + "kl": 0.091064453125, + "learning_rate": 1e-06, + "loss": 0.0166, + "num_tokens": 82544219.0, + "reward": 1.343750238418579, + "reward_std": 0.1931782364845276, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34375, + "rewards/curriculum_aware_reward_fn/std": 0.3354521691799164, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 851.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 437.21429443359375, + "completions/mean_terminated_length": 437.21429443359375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.7387155016765541, + "grad_norm": 0.6753544211387634, + "kl": 0.0867919921875, + "learning_rate": 1e-06, + "loss": 0.0181, + "num_tokens": 82653919.0, + "reward": 1.3343751430511475, + "reward_std": 0.13460318744182587, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3343749940395355, + "rewards/curriculum_aware_reward_fn/std": 0.2910861372947693, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1049.0, + "completions/max_terminated_length": 1049.0, + "completions/mean_length": 502.1785888671875, + "completions/mean_terminated_length": 502.1785888671875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.7397472272375548, + "grad_norm": 0.751768171787262, + "kl": 0.0675048828125, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 82781227.0, + "reward": 1.250000238418579, + "reward_std": 0.1571681946516037, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25, + "rewards/curriculum_aware_reward_fn/std": 0.2835584580898285, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1129.0, + "completions/max_terminated_length": 1129.0, + "completions/mean_length": 498.8660888671875, + "completions/mean_terminated_length": 498.8660888671875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.7407789527985555, + "grad_norm": 0.8301697969436646, + "kl": 0.08447265625, + "learning_rate": 1e-06, + "loss": -0.0168, + "num_tokens": 82899358.0, + "reward": 1.3062500953674316, + "reward_std": 0.18210452795028687, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645, + "rewards/curriculum_aware_reward_fn/std": 0.30852195620536804, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 839.0, + "completions/max_terminated_length": 839.0, + "completions/mean_length": 455.982177734375, + "completions/mean_terminated_length": 455.982177734375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.7418106783595564, + "grad_norm": 0.5934935808181763, + "kl": 0.087890625, + "learning_rate": 1e-06, + "loss": 0.0086, + "num_tokens": 83013932.0, + "reward": 1.2937501668930054, + "reward_std": 0.13134068250656128, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.2991824150085449, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 777.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 481.5535888671875, + "completions/mean_terminated_length": 481.5535888671875, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.7428424039205571, + "grad_norm": 0.720224142074585, + "kl": 0.0858154296875, + "learning_rate": 1e-06, + "loss": 0.0213, + "num_tokens": 83130726.0, + "reward": 1.171875238418579, + "reward_std": 0.1682029515504837, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.171875, + "rewards/curriculum_aware_reward_fn/std": 0.2529805302619934, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 397.5357360839844, + "completions/mean_terminated_length": 397.5357360839844, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.743874129481558, + "grad_norm": 0.6368864178657532, + "kl": 0.0858154296875, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 83241722.0, + "reward": 1.3781250715255737, + "reward_std": 0.13773974776268005, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.32933053374290466, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 437.5535888671875, + "completions/mean_terminated_length": 437.5535888671875, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.7449058550425587, + "grad_norm": 0.8228105306625366, + "kl": 0.0982666015625, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 83361302.0, + "reward": 1.281250238418579, + "reward_std": 0.1560710370540619, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.3145943284034729, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 472.0714416503906, + "completions/mean_terminated_length": 472.0714416503906, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.7459375806035594, + "grad_norm": 0.7197965383529663, + "kl": 0.08154296875, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 83482396.0, + "reward": 1.2906252145767212, + "reward_std": 0.12630914151668549, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.27349352836608887, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 475.40179443359375, + "completions/mean_terminated_length": 475.40179443359375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.7469693061645603, + "grad_norm": 0.6598973870277405, + "kl": 0.079833984375, + "learning_rate": 1e-06, + "loss": 0.0206, + "num_tokens": 83611927.0, + "reward": 1.2906252145767212, + "reward_std": 0.1274426281452179, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.29671862721443176, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.0, + "completions/max_terminated_length": 792.0, + "completions/mean_length": 394.26788330078125, + "completions/mean_terminated_length": 394.26788330078125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.748001031725561, + "grad_norm": 0.6687096357345581, + "kl": 0.08544921875, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 83714619.0, + "reward": 1.328125, + "reward_std": 0.13225266337394714, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.328125, + "rewards/curriculum_aware_reward_fn/std": 0.32980892062187195, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 422.4910888671875, + "completions/mean_terminated_length": 422.4910888671875, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.7490327572865618, + "grad_norm": 0.6657594442367554, + "kl": 0.0855712890625, + "learning_rate": 1e-06, + "loss": 0.0197, + "num_tokens": 83824105.0, + "reward": 1.2531250715255737, + "reward_std": 0.08941584080457687, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25312498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.2707414925098419, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 802.0, + "completions/max_terminated_length": 802.0, + "completions/mean_length": 463.9464416503906, + "completions/mean_terminated_length": 463.9464416503906, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.7500644828475626, + "grad_norm": 0.6711493134498596, + "kl": 0.0826416015625, + "learning_rate": 1e-06, + "loss": -0.004, + "num_tokens": 83943428.0, + "reward": 1.2250001430511475, + "reward_std": 0.16174761950969696, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22500000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.34156501293182373, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 496.3750305175781, + "completions/mean_terminated_length": 496.3750305175781, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.7510962084085633, + "grad_norm": 0.7064552903175354, + "kl": 0.07373046875, + "learning_rate": 1e-06, + "loss": -0.0308, + "num_tokens": 84066124.0, + "reward": 1.171875, + "reward_std": 0.1559474766254425, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.171875, + "rewards/curriculum_aware_reward_fn/std": 0.2529805302619934, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 389.96429443359375, + "completions/mean_terminated_length": 389.96429443359375, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.7521279339695641, + "grad_norm": 0.5838387608528137, + "kl": 0.076904296875, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 84173150.0, + "reward": 1.3218752145767212, + "reward_std": 0.11716850101947784, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3218749463558197, + "rewards/curriculum_aware_reward_fn/std": 0.33266472816467285, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 909.0, + "completions/max_terminated_length": 909.0, + "completions/mean_length": 438.89288330078125, + "completions/mean_terminated_length": 438.89288330078125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.7531596595305649, + "grad_norm": 0.732061505317688, + "kl": 0.072998046875, + "learning_rate": 1e-06, + "loss": 0.0078, + "num_tokens": 84285803.0, + "reward": 1.278571605682373, + "reward_std": 0.18561238050460815, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.2874999940395355, + "rewards/curriculum_aware_reward_fn/std": 0.30524691939353943, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 416.6071472167969, + "completions/mean_terminated_length": 416.6071472167969, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.7541913850915657, + "grad_norm": 0.781933605670929, + "kl": 0.075439453125, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 84391494.0, + "reward": 1.1968752145767212, + "reward_std": 0.1363893300294876, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.2730608284473419, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1764.0, + "completions/max_terminated_length": 1764.0, + "completions/mean_length": 439.9375305175781, + "completions/mean_terminated_length": 439.9375305175781, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.7552231106525664, + "grad_norm": 0.5383293628692627, + "kl": 0.077880859375, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 84514316.0, + "reward": 1.1687500476837158, + "reward_std": 0.06947815418243408, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16875000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.2778719961643219, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1002.0, + "completions/max_terminated_length": 1002.0, + "completions/mean_length": 470.7410888671875, + "completions/mean_terminated_length": 470.7410888671875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.7562548362135671, + "grad_norm": 0.7183967232704163, + "kl": 0.07177734375, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 84644116.0, + "reward": 1.2531250715255737, + "reward_std": 0.15776854753494263, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25312501192092896, + "rewards/curriculum_aware_reward_fn/std": 0.29040831327438354, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1185.0, + "completions/max_terminated_length": 1185.0, + "completions/mean_length": 443.5357360839844, + "completions/mean_terminated_length": 443.5357360839844, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.757286561774568, + "grad_norm": 0.6907499432563782, + "kl": 0.080810546875, + "learning_rate": 1e-06, + "loss": 0.0224, + "num_tokens": 84766326.0, + "reward": 1.1437500715255737, + "reward_std": 0.12301648408174515, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14374998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.24209344387054443, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 883.0, + "completions/max_terminated_length": 883.0, + "completions/mean_length": 447.9107360839844, + "completions/mean_terminated_length": 447.9107360839844, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.7583182873355687, + "grad_norm": 0.7715913653373718, + "kl": 0.076171875, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 84888634.0, + "reward": 1.1625001430511475, + "reward_std": 0.15382583439350128, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16250000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.26546746492385864, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 434.5446472167969, + "completions/mean_terminated_length": 434.5446472167969, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.7593500128965696, + "grad_norm": 0.7604141235351562, + "kl": 0.0784912109375, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 85000233.0, + "reward": 1.2718751430511475, + "reward_std": 0.1417272835969925, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2718749940395355, + "rewards/curriculum_aware_reward_fn/std": 0.31060686707496643, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 448.65179443359375, + "completions/mean_terminated_length": 448.65179443359375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.7603817384575703, + "grad_norm": 0.7140162587165833, + "kl": 0.083251953125, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 85118886.0, + "reward": 1.3250001668930054, + "reward_std": 0.13391801714897156, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32500001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.33785226941108704, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 382.1785888671875, + "completions/mean_terminated_length": 382.1785888671875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.761413464018571, + "grad_norm": 0.7177646160125732, + "kl": 0.0946044921875, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 85226286.0, + "reward": 1.4000000953674316, + "reward_std": 0.12313154339790344, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40000003576278687, + "rewards/curriculum_aware_reward_fn/std": 0.30756235122680664, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1061.0, + "completions/max_terminated_length": 1061.0, + "completions/mean_length": 481.51788330078125, + "completions/mean_terminated_length": 481.51788330078125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.7624451895795719, + "grad_norm": 0.6312434673309326, + "kl": 0.0701904296875, + "learning_rate": 1e-06, + "loss": 0.0171, + "num_tokens": 85348158.0, + "reward": 1.296875, + "reward_std": 0.12531256675720215, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.296875, + "rewards/curriculum_aware_reward_fn/std": 0.3228031396865845, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 451.58038330078125, + "completions/mean_terminated_length": 451.58038330078125, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.7634769151405726, + "grad_norm": 0.7754645943641663, + "kl": 0.083984375, + "learning_rate": 1e-06, + "loss": 0.0074, + "num_tokens": 85467622.0, + "reward": 1.2125002145767212, + "reward_std": 0.19816157221794128, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21249999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.279357373714447, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1117.0, + "completions/max_terminated_length": 1117.0, + "completions/mean_length": 437.4375305175781, + "completions/mean_terminated_length": 437.4375305175781, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.7645086407015734, + "grad_norm": 0.6591800451278687, + "kl": 0.081298828125, + "learning_rate": 1e-06, + "loss": -0.0114, + "num_tokens": 85578607.0, + "reward": 1.281250238418579, + "reward_std": 0.08816681802272797, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28125, + "rewards/curriculum_aware_reward_fn/std": 0.3145943284034729, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 452.51788330078125, + "completions/mean_terminated_length": 452.51788330078125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.7655403662625742, + "grad_norm": 0.76875239610672, + "kl": 0.08154296875, + "learning_rate": 1e-06, + "loss": 0.0062, + "num_tokens": 85692779.0, + "reward": 1.2562501430511475, + "reward_std": 0.15450942516326904, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2562499940395355, + "rewards/curriculum_aware_reward_fn/std": 0.2933286130428314, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 399.5982360839844, + "completions/mean_terminated_length": 399.5982360839844, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.7665720918235749, + "grad_norm": 0.7755102515220642, + "kl": 0.0784912109375, + "learning_rate": 1e-06, + "loss": 0.0093, + "num_tokens": 85809613.0, + "reward": 1.421875238418579, + "reward_std": 0.1664501279592514, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.421875, + "rewards/curriculum_aware_reward_fn/std": 0.30856987833976746, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1017.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 491.4107360839844, + "completions/mean_terminated_length": 491.4107360839844, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.7676038173845757, + "grad_norm": 0.7311500310897827, + "kl": 0.082763671875, + "learning_rate": 1e-06, + "loss": 0.021, + "num_tokens": 85927112.0, + "reward": 1.1906250715255737, + "reward_std": 0.16675879061222076, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19062498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.26530036330223083, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 453.6785888671875, + "completions/mean_terminated_length": 453.6785888671875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.7686355429455765, + "grad_norm": 0.7551841139793396, + "kl": 0.07421875, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 86047273.0, + "reward": 1.296875238418579, + "reward_std": 0.189897820353508, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.296875, + "rewards/curriculum_aware_reward_fn/std": 0.28658291697502136, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 777.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 471.8750305175781, + "completions/mean_terminated_length": 471.8750305175781, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.7696672685065773, + "grad_norm": 0.7156517505645752, + "kl": 0.07177734375, + "learning_rate": 1e-06, + "loss": -0.0135, + "num_tokens": 86168023.0, + "reward": 1.171875, + "reward_std": 0.15609991550445557, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1718749850988388, + "rewards/curriculum_aware_reward_fn/std": 0.26986658573150635, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 786.0, + "completions/max_terminated_length": 786.0, + "completions/mean_length": 411.3571472167969, + "completions/mean_terminated_length": 411.3571472167969, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.770698994067578, + "grad_norm": 0.7727649211883545, + "kl": 0.08251953125, + "learning_rate": 1e-06, + "loss": -0.0199, + "num_tokens": 86274617.0, + "reward": 1.3437501192092896, + "reward_std": 0.2190425992012024, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34375, + "rewards/curriculum_aware_reward_fn/std": 0.32543280720710754, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 446.3750305175781, + "completions/mean_terminated_length": 446.3750305175781, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.7717307196285788, + "grad_norm": 0.6120944023132324, + "kl": 0.076416015625, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 86394401.0, + "reward": 1.2375000715255737, + "reward_std": 0.1284002959728241, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.23749998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3016097843647003, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1311.0, + "completions/max_terminated_length": 1311.0, + "completions/mean_length": 431.33038330078125, + "completions/mean_terminated_length": 431.33038330078125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.7727624451895796, + "grad_norm": 0.8004124760627747, + "kl": 0.0838623046875, + "learning_rate": 1e-06, + "loss": 0.0162, + "num_tokens": 86514502.0, + "reward": 1.3375000953674316, + "reward_std": 0.19743388891220093, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.31138312816619873, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 729.0, + "completions/max_terminated_length": 729.0, + "completions/mean_length": 432.4732360839844, + "completions/mean_terminated_length": 432.4732360839844, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.7737941707505803, + "grad_norm": 0.8266004323959351, + "kl": 0.08447265625, + "learning_rate": 1e-06, + "loss": -0.005, + "num_tokens": 86630601.0, + "reward": 1.2843750715255737, + "reward_std": 0.21234013140201569, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28437498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3063907325267792, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 984.0, + "completions/max_terminated_length": 984.0, + "completions/mean_length": 494.8035888671875, + "completions/mean_terminated_length": 494.8035888671875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.7748258963115812, + "grad_norm": 0.7730136513710022, + "kl": 0.0733642578125, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 86746198.0, + "reward": 1.2156250476837158, + "reward_std": 0.1886139214038849, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21562500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.26678189635276794, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 944.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 460.2500305175781, + "completions/mean_terminated_length": 460.2500305175781, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.7758576218725819, + "grad_norm": 0.7083492279052734, + "kl": 0.0791015625, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 86860942.0, + "reward": 1.2281250953674316, + "reward_std": 0.127069354057312, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22812499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.284650981426239, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 427.7589416503906, + "completions/mean_terminated_length": 427.7589416503906, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.7768893474335826, + "grad_norm": 0.8819575905799866, + "kl": 0.0924072265625, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 86976975.0, + "reward": 1.296875238418579, + "reward_std": 0.17696364223957062, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.296875, + "rewards/curriculum_aware_reward_fn/std": 0.29791173338890076, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 411.6339416503906, + "completions/mean_terminated_length": 411.6339416503906, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.7779210729945835, + "grad_norm": 0.6597311496734619, + "kl": 0.079345703125, + "learning_rate": 1e-06, + "loss": -0.0233, + "num_tokens": 87084587.0, + "reward": 1.3375002145767212, + "reward_std": 0.14886173605918884, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3005625307559967, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 805.0, + "completions/max_terminated_length": 805.0, + "completions/mean_length": 441.8750305175781, + "completions/mean_terminated_length": 441.8750305175781, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.7789527985555842, + "grad_norm": 0.7229986786842346, + "kl": 0.08251953125, + "learning_rate": 1e-06, + "loss": -0.0129, + "num_tokens": 87201557.0, + "reward": 1.4125001430511475, + "reward_std": 0.18330040574073792, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41249996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.30884116888046265, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 876.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 446.14288330078125, + "completions/mean_terminated_length": 446.14288330078125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.779984524116585, + "grad_norm": 0.6569157242774963, + "kl": 0.0789794921875, + "learning_rate": 1e-06, + "loss": 0.0141, + "num_tokens": 87312502.0, + "reward": 1.3375002145767212, + "reward_std": 0.1148693636059761, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3286266624927521, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 376.6071472167969, + "completions/mean_terminated_length": 376.6071472167969, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.7810162496775858, + "grad_norm": 0.583123505115509, + "kl": 0.0899658203125, + "learning_rate": 1e-06, + "loss": 0.0074, + "num_tokens": 87414224.0, + "reward": 1.4125001430511475, + "reward_std": 0.07358650118112564, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41249996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.3295847773551941, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 426.9910888671875, + "completions/mean_terminated_length": 426.9910888671875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.7820479752385865, + "grad_norm": 0.8279862999916077, + "kl": 0.0753173828125, + "learning_rate": 1e-06, + "loss": 0.0155, + "num_tokens": 87525600.0, + "reward": 1.25, + "reward_std": 0.1890469193458557, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25, + "rewards/curriculum_aware_reward_fn/std": 0.302392840385437, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 975.0, + "completions/max_terminated_length": 975.0, + "completions/mean_length": 459.33929443359375, + "completions/mean_terminated_length": 459.33929443359375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.7830797007995873, + "grad_norm": 0.5621715188026428, + "kl": 0.06854248046875, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 87639556.0, + "reward": 1.1500000953674316, + "reward_std": 0.11704766750335693, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14999999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.24732807278633118, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 384.9821472167969, + "completions/mean_terminated_length": 384.9821472167969, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.7841114263605881, + "grad_norm": 0.7570614218711853, + "kl": 0.0887451171875, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 87749804.0, + "reward": 1.2843750715255737, + "reward_std": 0.1683778166770935, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28437498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.330644428730011, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1262.0, + "completions/max_terminated_length": 1262.0, + "completions/mean_length": 477.77679443359375, + "completions/mean_terminated_length": 477.77679443359375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.7851431519215889, + "grad_norm": 0.643505871295929, + "kl": 0.0787353515625, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 87874260.0, + "reward": 1.2937501668930054, + "reward_std": 0.15550830960273743, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.32055166363716125, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 990.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 402.0982360839844, + "completions/mean_terminated_length": 402.0982360839844, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.7861748774825896, + "grad_norm": 0.7015209197998047, + "kl": 0.0675048828125, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 87979810.0, + "reward": 1.3062500953674316, + "reward_std": 0.1959277242422104, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645, + "rewards/curriculum_aware_reward_fn/std": 0.32591691613197327, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 364.08038330078125, + "completions/mean_terminated_length": 364.08038330078125, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.7872066030435904, + "grad_norm": 0.7301487922668457, + "kl": 0.0853271484375, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 88080185.0, + "reward": 1.3781250715255737, + "reward_std": 0.15376612544059753, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.304972380399704, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 373.9196472167969, + "completions/mean_terminated_length": 373.9196472167969, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.7882383286045912, + "grad_norm": 0.8684947490692139, + "kl": 0.0743408203125, + "learning_rate": 1e-06, + "loss": 0.0194, + "num_tokens": 88180136.0, + "reward": 1.4125001430511475, + "reward_std": 0.14428050816059113, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4124999940395355, + "rewards/curriculum_aware_reward_fn/std": 0.29420071840286255, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1064.0, + "completions/max_terminated_length": 1064.0, + "completions/mean_length": 463.02679443359375, + "completions/mean_terminated_length": 463.02679443359375, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.7892700541655919, + "grad_norm": 0.6717519760131836, + "kl": 0.0751953125, + "learning_rate": 1e-06, + "loss": -0.012, + "num_tokens": 88312908.0, + "reward": 1.2156251668930054, + "reward_std": 0.16076023876667023, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21562497317790985, + "rewards/curriculum_aware_reward_fn/std": 0.3017241060733795, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1097.0, + "completions/max_terminated_length": 1097.0, + "completions/mean_length": 456.77679443359375, + "completions/mean_terminated_length": 456.77679443359375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.7903017797265928, + "grad_norm": 0.7640579342842102, + "kl": 0.0853271484375, + "learning_rate": 1e-06, + "loss": -0.0133, + "num_tokens": 88437665.0, + "reward": 1.25, + "reward_std": 0.18398447334766388, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2500000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.2912384271621704, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 387.0982360839844, + "completions/mean_terminated_length": 387.0982360839844, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.7913335052875935, + "grad_norm": 0.7477678060531616, + "kl": 0.0906982421875, + "learning_rate": 1e-06, + "loss": 0.0197, + "num_tokens": 88554037.0, + "reward": 1.2625001668930054, + "reward_std": 0.14247135818004608, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.26250001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.2989847958087921, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 370.9464416503906, + "completions/mean_terminated_length": 370.9464416503906, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.7923652308485942, + "grad_norm": 0.8266369104385376, + "kl": 0.086181640625, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 88661068.0, + "reward": 1.3660715818405151, + "reward_std": 0.18112213909626007, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.375, + "rewards/curriculum_aware_reward_fn/std": 0.3141555190086365, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1077.0, + "completions/max_terminated_length": 1077.0, + "completions/mean_length": 429.1071472167969, + "completions/mean_terminated_length": 429.1071472167969, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.7933969564095951, + "grad_norm": 0.7582897543907166, + "kl": 0.08447265625, + "learning_rate": 1e-06, + "loss": 0.013, + "num_tokens": 88776657.0, + "reward": 1.2687500715255737, + "reward_std": 0.15974640846252441, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.26874998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3080105483531952, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 389.01788330078125, + "completions/mean_terminated_length": 389.01788330078125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.7944286819705958, + "grad_norm": 0.6961264610290527, + "kl": 0.0784912109375, + "learning_rate": 1e-06, + "loss": 0.0184, + "num_tokens": 88886447.0, + "reward": 1.3468753099441528, + "reward_std": 0.13390925526618958, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3271692097187042, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 418.89288330078125, + "completions/mean_terminated_length": 418.89288330078125, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.7954604075315966, + "grad_norm": 0.7252551317214966, + "kl": 0.0838623046875, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 89005617.0, + "reward": 1.2406251430511475, + "reward_std": 0.16366921365261078, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24062499403953552, + "rewards/curriculum_aware_reward_fn/std": 0.31174683570861816, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 772.0, + "completions/max_terminated_length": 772.0, + "completions/mean_length": 365.5982360839844, + "completions/mean_terminated_length": 365.5982360839844, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.7964921330925974, + "grad_norm": 0.727345883846283, + "kl": 0.099365234375, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 89113946.0, + "reward": 1.3500001430511475, + "reward_std": 0.15670651197433472, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34999996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.31515759229660034, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1211.0, + "completions/max_terminated_length": 1211.0, + "completions/mean_length": 401.76788330078125, + "completions/mean_terminated_length": 401.76788330078125, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.7975238586535981, + "grad_norm": 0.6628202795982361, + "kl": 0.0909423828125, + "learning_rate": 1e-06, + "loss": 0.0239, + "num_tokens": 89224758.0, + "reward": 1.3441965579986572, + "reward_std": 0.1218257024884224, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3062620460987091, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 964.0, + "completions/max_terminated_length": 964.0, + "completions/mean_length": 452.419677734375, + "completions/mean_terminated_length": 452.419677734375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.7985555842145989, + "grad_norm": 0.8155584931373596, + "kl": 0.0916748046875, + "learning_rate": 1e-06, + "loss": -0.0088, + "num_tokens": 89346934.0, + "reward": 1.2562501430511475, + "reward_std": 0.1607874482870102, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25624996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.25724852085113525, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 439.27679443359375, + "completions/mean_terminated_length": 439.27679443359375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.7995873097755997, + "grad_norm": 0.5273690819740295, + "kl": 0.0804443359375, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 89466476.0, + "reward": 1.2250001430511475, + "reward_std": 0.0889258086681366, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22500000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.2890649735927582, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 799.0, + "completions/max_terminated_length": 799.0, + "completions/mean_length": 430.2321472167969, + "completions/mean_terminated_length": 430.2321472167969, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.8006190353366005, + "grad_norm": 0.7398731708526611, + "kl": 0.0816650390625, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 89571390.0, + "reward": 1.25, + "reward_std": 0.1409335881471634, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25, + "rewards/curriculum_aware_reward_fn/std": 0.31315022706985474, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 406.5446472167969, + "completions/mean_terminated_length": 406.5446472167969, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.8016507608976012, + "grad_norm": 0.686924934387207, + "kl": 0.0880126953125, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 89684039.0, + "reward": 1.3375002145767212, + "reward_std": 0.09972135722637177, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33750003576278687, + "rewards/curriculum_aware_reward_fn/std": 0.2968680262565613, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 397.0357360839844, + "completions/mean_terminated_length": 397.0357360839844, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.802682486458602, + "grad_norm": 0.7994930744171143, + "kl": 0.0853271484375, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 89792753.0, + "reward": 1.3343751430511475, + "reward_std": 0.17851689457893372, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.29857251048088074, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 425.7232360839844, + "completions/mean_terminated_length": 425.7232360839844, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.8037142120196028, + "grad_norm": 0.7430455684661865, + "kl": 0.0855712890625, + "learning_rate": 1e-06, + "loss": 0.0136, + "num_tokens": 89909216.0, + "reward": 1.3218750953674316, + "reward_std": 0.1822567731142044, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32187503576278687, + "rewards/curriculum_aware_reward_fn/std": 0.32933056354522705, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1038.0, + "completions/max_terminated_length": 1038.0, + "completions/mean_length": 462.3035888671875, + "completions/mean_terminated_length": 462.3035888671875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.8047459375806035, + "grad_norm": 0.7800207138061523, + "kl": 0.078125, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 90019633.0, + "reward": 1.2218750715255737, + "reward_std": 0.13174307346343994, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22187498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.27792516350746155, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1368.0, + "completions/max_terminated_length": 1368.0, + "completions/mean_length": 438.52679443359375, + "completions/mean_terminated_length": 438.52679443359375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.8057776631416044, + "grad_norm": 0.805196225643158, + "kl": 0.0794677734375, + "learning_rate": 1e-06, + "loss": -0.0377, + "num_tokens": 90136032.0, + "reward": 1.3218752145767212, + "reward_std": 0.2107643485069275, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32187503576278687, + "rewards/curriculum_aware_reward_fn/std": 0.31212589144706726, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 996.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 434.64288330078125, + "completions/mean_terminated_length": 434.64288330078125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.8068093887026051, + "grad_norm": 0.8555618524551392, + "kl": 0.0823974609375, + "learning_rate": 1e-06, + "loss": -0.0052, + "num_tokens": 90252073.0, + "reward": 1.303125023841858, + "reward_std": 0.154591366648674, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30312496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.29896828532218933, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 454.0000305175781, + "completions/mean_terminated_length": 454.0000305175781, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.8078411142636058, + "grad_norm": 0.7412270307540894, + "kl": 0.0733642578125, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 90364171.0, + "reward": 1.1687500476837158, + "reward_std": 0.19139978289604187, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.16875000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.2738715708255768, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1343.0, + "completions/max_terminated_length": 1343.0, + "completions/mean_length": 429.7500305175781, + "completions/mean_terminated_length": 429.7500305175781, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.8088728398246067, + "grad_norm": 0.7853583693504333, + "kl": 0.0892333984375, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 90487896.0, + "reward": 1.2687500715255737, + "reward_std": 0.15851913392543793, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.26875001192092896, + "rewards/curriculum_aware_reward_fn/std": 0.3080105185508728, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 440.2232360839844, + "completions/mean_terminated_length": 440.2232360839844, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.8099045653856074, + "grad_norm": 0.7697345614433289, + "kl": 0.0906982421875, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 90602819.0, + "reward": 1.1812500953674316, + "reward_std": 0.17448507249355316, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.18124999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.2738715410232544, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1053.0, + "completions/max_terminated_length": 1053.0, + "completions/mean_length": 474.6160888671875, + "completions/mean_terminated_length": 474.6160888671875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.8109362909466082, + "grad_norm": 0.6602099537849426, + "kl": 0.091796875, + "learning_rate": 1e-06, + "loss": 0.0265, + "num_tokens": 90723512.0, + "reward": 1.3375000953674316, + "reward_std": 0.1300942748785019, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.30781853199005127, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 425.0446472167969, + "completions/mean_terminated_length": 425.0446472167969, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.811968016507609, + "grad_norm": 0.6979688405990601, + "kl": 0.0908203125, + "learning_rate": 1e-06, + "loss": -0.0071, + "num_tokens": 90836644.0, + "reward": 1.281250238418579, + "reward_std": 0.11639193445444107, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.31106650829315186, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 481.607177734375, + "completions/mean_terminated_length": 481.607177734375, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.8129997420686097, + "grad_norm": 0.805659830570221, + "kl": 0.082763671875, + "learning_rate": 1e-06, + "loss": 0.0219, + "num_tokens": 90958932.0, + "reward": 1.2093751430511475, + "reward_std": 0.15934167802333832, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.20937499403953552, + "rewards/curriculum_aware_reward_fn/std": 0.26766687631607056, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 405.33038330078125, + "completions/mean_terminated_length": 405.33038330078125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.8140314676296105, + "grad_norm": 0.8081932067871094, + "kl": 0.0911865234375, + "learning_rate": 1e-06, + "loss": 0.0063, + "num_tokens": 91061057.0, + "reward": 1.328125, + "reward_std": 0.11970683932304382, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.328125, + "rewards/curriculum_aware_reward_fn/std": 0.2944517731666565, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 916.0, + "completions/max_terminated_length": 916.0, + "completions/mean_length": 427.4910888671875, + "completions/mean_terminated_length": 427.4910888671875, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.8150631931906113, + "grad_norm": 0.835981547832489, + "kl": 0.0826416015625, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 91178985.0, + "reward": 1.2937501668930054, + "reward_std": 0.14694808423519135, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29374998807907104, + "rewards/curriculum_aware_reward_fn/std": 0.3135904371738434, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 415.3750305175781, + "completions/mean_terminated_length": 415.3750305175781, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.8160949187516121, + "grad_norm": 0.7383970618247986, + "kl": 0.0889892578125, + "learning_rate": 1e-06, + "loss": -0.0087, + "num_tokens": 91285399.0, + "reward": 1.28125, + "reward_std": 0.16779515147209167, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.3283267021179199, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1140.0, + "completions/max_terminated_length": 1140.0, + "completions/mean_length": 487.0535888671875, + "completions/mean_terminated_length": 487.0535888671875, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.8171266443126128, + "grad_norm": 0.683323860168457, + "kl": 0.0792236328125, + "learning_rate": 1e-06, + "loss": 0.0219, + "num_tokens": 91407149.0, + "reward": 1.2718751430511475, + "reward_std": 0.09385330229997635, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2718749940395355, + "rewards/curriculum_aware_reward_fn/std": 0.2885019779205322, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 986.0, + "completions/max_terminated_length": 986.0, + "completions/mean_length": 419.1071472167969, + "completions/mean_terminated_length": 419.1071472167969, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.8181583698736136, + "grad_norm": 0.7385226488113403, + "kl": 0.086181640625, + "learning_rate": 1e-06, + "loss": 0.0095, + "num_tokens": 91519343.0, + "reward": 1.3093751668930054, + "reward_std": 0.155110701918602, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3507733643054962, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 432.1875305175781, + "completions/mean_terminated_length": 432.1875305175781, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.8191900954346144, + "grad_norm": 0.8252224326133728, + "kl": 0.092041015625, + "learning_rate": 1e-06, + "loss": 0.0237, + "num_tokens": 91633702.0, + "reward": 1.2625001668930054, + "reward_std": 0.1600162237882614, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.26249998807907104, + "rewards/curriculum_aware_reward_fn/std": 0.2989847660064697, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 438.1696472167969, + "completions/mean_terminated_length": 438.1696472167969, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.8202218209956151, + "grad_norm": 0.7697513103485107, + "kl": 0.0823974609375, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 91738098.0, + "reward": 1.328125, + "reward_std": 0.15455463528633118, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.328125, + "rewards/curriculum_aware_reward_fn/std": 0.3018546998500824, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1126.0, + "completions/max_terminated_length": 1126.0, + "completions/mean_length": 434.8035888671875, + "completions/mean_terminated_length": 434.8035888671875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.821253546556616, + "grad_norm": 0.7645787000656128, + "kl": 0.07861328125, + "learning_rate": 1e-06, + "loss": 0.0114, + "num_tokens": 91859407.0, + "reward": 1.3812501430511475, + "reward_std": 0.16925767064094543, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38124996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.31005120277404785, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1202.0, + "completions/max_terminated_length": 1202.0, + "completions/mean_length": 453.232177734375, + "completions/mean_terminated_length": 453.232177734375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.8222852721176167, + "grad_norm": 0.7502590417861938, + "kl": 0.079833984375, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 91982269.0, + "reward": 1.250000238418579, + "reward_std": 0.09290025383234024, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25, + "rewards/curriculum_aware_reward_fn/std": 0.29872098565101624, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 931.0, + "completions/max_terminated_length": 931.0, + "completions/mean_length": 460.58929443359375, + "completions/mean_terminated_length": 460.58929443359375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.8233169976786174, + "grad_norm": 0.7852086424827576, + "kl": 0.084228515625, + "learning_rate": 1e-06, + "loss": 0.0219, + "num_tokens": 92100922.0, + "reward": 1.2281252145767212, + "reward_std": 0.1186632364988327, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22812499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.2560775876045227, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 433.2946472167969, + "completions/mean_terminated_length": 433.2946472167969, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.8243487232396183, + "grad_norm": 0.7842056155204773, + "kl": 0.0833740234375, + "learning_rate": 1e-06, + "loss": 0.0183, + "num_tokens": 92218296.0, + "reward": 1.3156250715255737, + "reward_std": 0.2091911882162094, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.31501689553260803, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 470.8750305175781, + "completions/mean_terminated_length": 470.8750305175781, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.825380448800619, + "grad_norm": 0.8007763028144836, + "kl": 0.0863037109375, + "learning_rate": 1e-06, + "loss": 0.0125, + "num_tokens": 92344982.0, + "reward": 1.2312501668930054, + "reward_std": 0.166080504655838, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.23125000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.2596884071826935, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1138.0, + "completions/max_terminated_length": 1138.0, + "completions/mean_length": 463.669677734375, + "completions/mean_terminated_length": 463.669677734375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.8264121743616198, + "grad_norm": 0.6105215549468994, + "kl": 0.0792236328125, + "learning_rate": 1e-06, + "loss": 0.0339, + "num_tokens": 92462860.0, + "reward": 1.2156251668930054, + "reward_std": 0.14187639951705933, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21562500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.2789161205291748, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1159.0, + "completions/max_terminated_length": 1159.0, + "completions/mean_length": 421.8660888671875, + "completions/mean_terminated_length": 421.8660888671875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.8274438999226206, + "grad_norm": 0.8359569311141968, + "kl": 0.082763671875, + "learning_rate": 1e-06, + "loss": 0.0386, + "num_tokens": 92576777.0, + "reward": 1.3843752145767212, + "reward_std": 0.1463547945022583, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3843750059604645, + "rewards/curriculum_aware_reward_fn/std": 0.31501689553260803, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1227.0, + "completions/max_terminated_length": 1227.0, + "completions/mean_length": 462.044677734375, + "completions/mean_terminated_length": 462.044677734375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.8284756254836213, + "grad_norm": 0.6880461573600769, + "kl": 0.084716796875, + "learning_rate": 1e-06, + "loss": 0.0266, + "num_tokens": 92697434.0, + "reward": 1.3062502145767212, + "reward_std": 0.1431155502796173, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645, + "rewards/curriculum_aware_reward_fn/std": 0.32591691613197327, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 420.6339416503906, + "completions/mean_terminated_length": 420.6339416503906, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.8295073510446221, + "grad_norm": 0.7960591912269592, + "kl": 0.085205078125, + "learning_rate": 1e-06, + "loss": -0.0154, + "num_tokens": 92811724.0, + "reward": 1.3125, + "reward_std": 0.1750551015138626, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3125, + "rewards/curriculum_aware_reward_fn/std": 0.33006277680397034, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 847.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 396.5357360839844, + "completions/mean_terminated_length": 396.5357360839844, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.8305390766056229, + "grad_norm": 0.8146962523460388, + "kl": 0.0845947265625, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 92921719.0, + "reward": 1.3562501668930054, + "reward_std": 0.1803237348794937, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.31509506702423096, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 422.3125305175781, + "completions/mean_terminated_length": 422.3125305175781, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.8315708021666237, + "grad_norm": 0.7674344778060913, + "kl": 0.0875244140625, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 93030527.0, + "reward": 1.2437500953674316, + "reward_std": 0.12594076991081238, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24375000596046448, + "rewards/curriculum_aware_reward_fn/std": 0.3074982464313507, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 422.2232360839844, + "completions/mean_terminated_length": 422.2232360839844, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.8326025277276244, + "grad_norm": 0.862937867641449, + "kl": 0.08349609375, + "learning_rate": 1e-06, + "loss": 0.0068, + "num_tokens": 93143588.0, + "reward": 1.3781250715255737, + "reward_std": 0.18341077864170074, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.29764699935913086, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1708.0, + "completions/max_terminated_length": 1708.0, + "completions/mean_length": 466.4732360839844, + "completions/mean_terminated_length": 466.4732360839844, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.8336342532886252, + "grad_norm": 0.7200672626495361, + "kl": 0.0762939453125, + "learning_rate": 1e-06, + "loss": 0.0187, + "num_tokens": 93262186.0, + "reward": 1.2281252145767212, + "reward_std": 0.11673900485038757, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22812499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.27678829431533813, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 409.6160888671875, + "completions/mean_terminated_length": 409.6160888671875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.834665978849626, + "grad_norm": 0.8438058495521545, + "kl": 0.089599609375, + "learning_rate": 1e-06, + "loss": -0.0151, + "num_tokens": 93372204.0, + "reward": 1.2879464626312256, + "reward_std": 0.2107357233762741, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.296875, + "rewards/curriculum_aware_reward_fn/std": 0.30159345269203186, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 862.0, + "completions/max_terminated_length": 862.0, + "completions/mean_length": 476.0535888671875, + "completions/mean_terminated_length": 476.0535888671875, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.8356977044106267, + "grad_norm": 0.7459707260131836, + "kl": 0.0853271484375, + "learning_rate": 1e-06, + "loss": 0.0241, + "num_tokens": 93507296.0, + "reward": 1.2125002145767212, + "reward_std": 0.18899433314800262, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21249999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.275378555059433, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1348.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 473.0357360839844, + "completions/mean_terminated_length": 473.0357360839844, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.8367294299716276, + "grad_norm": 0.7402332425117493, + "kl": 0.0797119140625, + "learning_rate": 1e-06, + "loss": -0.0096, + "num_tokens": 93627539.0, + "reward": 1.203125, + "reward_std": 0.11372973769903183, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.203125, + "rewards/curriculum_aware_reward_fn/std": 0.2804662585258484, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 949.0, + "completions/max_terminated_length": 949.0, + "completions/mean_length": 416.5714416503906, + "completions/mean_terminated_length": 416.5714416503906, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.8377611555326283, + "grad_norm": 0.723020076751709, + "kl": 0.09765625, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 93743019.0, + "reward": 1.146875023841858, + "reward_std": 0.12751619517803192, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14687500894069672, + "rewards/curriculum_aware_reward_fn/std": 0.23788492381572723, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 445.107177734375, + "completions/mean_terminated_length": 445.107177734375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.8387928810936292, + "grad_norm": 0.6011431813240051, + "kl": 0.0804443359375, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 93863389.0, + "reward": 1.171875, + "reward_std": 0.1036277711391449, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.171875, + "rewards/curriculum_aware_reward_fn/std": 0.27792516350746155, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1143.0, + "completions/max_terminated_length": 1143.0, + "completions/mean_length": 458.169677734375, + "completions/mean_terminated_length": 458.169677734375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.8398246066546299, + "grad_norm": 0.7934638857841492, + "kl": 0.08935546875, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 93980077.0, + "reward": 1.359375238418579, + "reward_std": 0.13116928935050964, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.359375, + "rewards/curriculum_aware_reward_fn/std": 0.30613335967063904, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 401.9375305175781, + "completions/mean_terminated_length": 401.9375305175781, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.8408563322156306, + "grad_norm": 0.5980004668235779, + "kl": 0.0919189453125, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 94097467.0, + "reward": 1.2437502145767212, + "reward_std": 0.10399264842271805, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24374999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.2773040235042572, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 403.6160888671875, + "completions/mean_terminated_length": 403.6160888671875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.8418880577766314, + "grad_norm": 0.721108078956604, + "kl": 0.094482421875, + "learning_rate": 1e-06, + "loss": -0.0025, + "num_tokens": 94199239.0, + "reward": 1.3343751430511475, + "reward_std": 0.14032889902591705, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3343749940395355, + "rewards/curriculum_aware_reward_fn/std": 0.31998249888420105, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 431.70538330078125, + "completions/mean_terminated_length": 431.70538330078125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.8429197833376322, + "grad_norm": 0.7424548864364624, + "kl": 0.085205078125, + "learning_rate": 1e-06, + "loss": -0.0047, + "num_tokens": 94312984.0, + "reward": 1.2875001430511475, + "reward_std": 0.11729838699102402, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28749996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.29420071840286255, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 848.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 414.5357360839844, + "completions/mean_terminated_length": 414.5357360839844, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.843951508898633, + "grad_norm": 0.7833738327026367, + "kl": 0.087646484375, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 94425142.0, + "reward": 1.2000001668930054, + "reward_std": 0.16399544477462769, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.20000000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.331254780292511, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 444.669677734375, + "completions/mean_terminated_length": 444.669677734375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.8449832344596337, + "grad_norm": 0.7755659222602844, + "kl": 0.084228515625, + "learning_rate": 1e-06, + "loss": 0.0107, + "num_tokens": 94545204.0, + "reward": 1.2531250715255737, + "reward_std": 0.174255833029747, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25312498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.30882522463798523, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1086.0, + "completions/max_terminated_length": 1086.0, + "completions/mean_length": 421.6785888671875, + "completions/mean_terminated_length": 421.6785888671875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.8460149600206345, + "grad_norm": 0.7222782969474792, + "kl": 0.093017578125, + "learning_rate": 1e-06, + "loss": 0.0159, + "num_tokens": 94655831.0, + "reward": 1.4750001430511475, + "reward_std": 0.1430896818637848, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47499996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.28522157669067383, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 435.1607360839844, + "completions/mean_terminated_length": 435.1607360839844, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.8470466855816353, + "grad_norm": 0.8592573404312134, + "kl": 0.0877685546875, + "learning_rate": 1e-06, + "loss": 0.0067, + "num_tokens": 94771489.0, + "reward": 1.371875286102295, + "reward_std": 0.1934904009103775, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37187498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.3018546998500824, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 419.0000305175781, + "completions/mean_terminated_length": 419.0000305175781, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.848078411142636, + "grad_norm": 0.71229088306427, + "kl": 0.0948486328125, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 94882873.0, + "reward": 1.4031251668930054, + "reward_std": 0.12293804436922073, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40312501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.30159345269203186, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1017.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 453.5089416503906, + "completions/mean_terminated_length": 453.5089416503906, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.8491101367036369, + "grad_norm": 0.7214085459709167, + "kl": 0.084716796875, + "learning_rate": 1e-06, + "loss": 0.0232, + "num_tokens": 95001867.0, + "reward": 1.3218752145767212, + "reward_std": 0.16592714190483093, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3218750059604645, + "rewards/curriculum_aware_reward_fn/std": 0.32255885004997253, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 968.0, + "completions/max_terminated_length": 968.0, + "completions/mean_length": 413.6607360839844, + "completions/mean_terminated_length": 413.6607360839844, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.8501418622646376, + "grad_norm": 0.8571239709854126, + "kl": 0.0865478515625, + "learning_rate": 1e-06, + "loss": 0.0314, + "num_tokens": 95109734.0, + "reward": 1.2562501430511475, + "reward_std": 0.18357297778129578, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2562499940395355, + "rewards/curriculum_aware_reward_fn/std": 0.2933286130428314, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 785.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 392.1607360839844, + "completions/mean_terminated_length": 392.1607360839844, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.8511735878256383, + "grad_norm": 0.6368698477745056, + "kl": 0.0819091796875, + "learning_rate": 1e-06, + "loss": -0.0053, + "num_tokens": 95212813.0, + "reward": 1.2875001430511475, + "reward_std": 0.10738946497440338, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28749996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.3228183686733246, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.0, + "completions/max_terminated_length": 773.0, + "completions/mean_length": 403.5446472167969, + "completions/mean_terminated_length": 403.5446472167969, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.8522053133866392, + "grad_norm": 0.806452214717865, + "kl": 0.0963134765625, + "learning_rate": 1e-06, + "loss": 0.0234, + "num_tokens": 95314731.0, + "reward": 1.3656251430511475, + "reward_std": 0.13126225769519806, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3656249940395355, + "rewards/curriculum_aware_reward_fn/std": 0.2910861074924469, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 421.5089416503906, + "completions/mean_terminated_length": 421.5089416503906, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.8532370389476399, + "grad_norm": 0.8154585361480713, + "kl": 0.08837890625, + "learning_rate": 1e-06, + "loss": 0.0131, + "num_tokens": 95432451.0, + "reward": 1.1441963911056519, + "reward_std": 0.18211181461811066, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.15312500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.24312911927700043, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 413.3214416503906, + "completions/mean_terminated_length": 413.3214416503906, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.8542687645086408, + "grad_norm": 0.8320736885070801, + "kl": 0.0955810546875, + "learning_rate": 1e-06, + "loss": 0.0187, + "num_tokens": 95549072.0, + "reward": 1.3093750476837158, + "reward_std": 0.13835106790065765, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30937498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.3177575469017029, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 370.8125305175781, + "completions/mean_terminated_length": 370.8125305175781, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.8553004900696415, + "grad_norm": 0.8292173147201538, + "kl": 0.1004638671875, + "learning_rate": 1e-06, + "loss": 0.0045, + "num_tokens": 95652238.0, + "reward": 1.3906251192092896, + "reward_std": 0.1790168732404709, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.390625, + "rewards/curriculum_aware_reward_fn/std": 0.2998897135257721, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 447.2232360839844, + "completions/mean_terminated_length": 447.2232360839844, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.8563322156306422, + "grad_norm": 0.7676867246627808, + "kl": 0.0826416015625, + "learning_rate": 1e-06, + "loss": -0.0195, + "num_tokens": 95772187.0, + "reward": 1.334375023841858, + "reward_std": 0.1484861969947815, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.3094627261161804, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 390.76788330078125, + "completions/mean_terminated_length": 390.76788330078125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.857363941191643, + "grad_norm": 0.8843021988868713, + "kl": 0.09521484375, + "learning_rate": 1e-06, + "loss": -0.0165, + "num_tokens": 95882726.0, + "reward": 1.2906250953674316, + "reward_std": 0.1612338423728943, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3004149794578552, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 390.39288330078125, + "completions/mean_terminated_length": 390.39288330078125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.8583956667526438, + "grad_norm": 0.7326264977455139, + "kl": 0.09228515625, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 95989394.0, + "reward": 1.2250001430511475, + "reward_std": 0.11293934285640717, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22500000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.30030015110969543, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 385.40179443359375, + "completions/mean_terminated_length": 385.40179443359375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.8594273923136446, + "grad_norm": 0.9363760352134705, + "kl": 0.17724609375, + "learning_rate": 1e-06, + "loss": 0.006, + "num_tokens": 96104561.0, + "reward": 1.3406251668930054, + "reward_std": 0.09900873154401779, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34062501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3097173273563385, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 398.76788330078125, + "completions/mean_terminated_length": 398.76788330078125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.8604591178746454, + "grad_norm": 0.5181918144226074, + "kl": 0.0869140625, + "learning_rate": 1e-06, + "loss": 0.0117, + "num_tokens": 96220532.0, + "reward": 1.328125238418579, + "reward_std": 0.07820750772953033, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.328125, + "rewards/curriculum_aware_reward_fn/std": 0.3331383168697357, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 387.26788330078125, + "completions/mean_terminated_length": 387.26788330078125, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.8614908434356461, + "grad_norm": 0.5764657258987427, + "kl": 0.09765625, + "learning_rate": 1e-06, + "loss": -0.0065, + "num_tokens": 96327557.0, + "reward": 1.3468750715255737, + "reward_std": 0.0906006395816803, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34687501192092896, + "rewards/curriculum_aware_reward_fn/std": 0.3371369540691376, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 387.6071472167969, + "completions/mean_terminated_length": 387.6071472167969, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.8625225689966469, + "grad_norm": 0.834621787071228, + "kl": 0.088623046875, + "learning_rate": 1e-06, + "loss": 0.0322, + "num_tokens": 96442887.0, + "reward": 1.3312500715255737, + "reward_std": 0.15943463146686554, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33125001192092896, + "rewards/curriculum_aware_reward_fn/std": 0.32832667231559753, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 403.1250305175781, + "completions/mean_terminated_length": 403.1250305175781, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.8635542945576477, + "grad_norm": 0.7438127398490906, + "kl": 0.0938720703125, + "learning_rate": 1e-06, + "loss": 0.0127, + "num_tokens": 96552269.0, + "reward": 1.2535713911056519, + "reward_std": 0.12903402745723724, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.26249998807907104, + "rewards/curriculum_aware_reward_fn/std": 0.2989847660064697, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 777.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 395.5625305175781, + "completions/mean_terminated_length": 395.5625305175781, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.8645860201186485, + "grad_norm": 0.799825131893158, + "kl": 0.088623046875, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 96662778.0, + "reward": 1.3250001668930054, + "reward_std": 0.1471077799797058, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32500001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3176490366458893, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1083.0, + "completions/max_terminated_length": 1083.0, + "completions/mean_length": 399.89288330078125, + "completions/mean_terminated_length": 399.89288330078125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.8656177456796492, + "grad_norm": 0.7156750559806824, + "kl": 0.08203125, + "learning_rate": 1e-06, + "loss": 0.0155, + "num_tokens": 96775018.0, + "reward": 1.3437501192092896, + "reward_std": 0.10255005210638046, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34375, + "rewards/curriculum_aware_reward_fn/std": 0.3080105185508728, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 386.9464416503906, + "completions/mean_terminated_length": 386.9464416503906, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.86664947124065, + "grad_norm": 0.7068809270858765, + "kl": 0.104736328125, + "learning_rate": 1e-06, + "loss": 0.0071, + "num_tokens": 96887159.0, + "reward": 1.312500238418579, + "reward_std": 0.15039849281311035, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3125, + "rewards/curriculum_aware_reward_fn/std": 0.33338961005210876, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 395.02679443359375, + "completions/mean_terminated_length": 395.02679443359375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.8676811968016508, + "grad_norm": 0.6292682886123657, + "kl": 0.0875244140625, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 96993506.0, + "reward": 1.3656251430511475, + "reward_std": 0.09355327486991882, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3656249940395355, + "rewards/curriculum_aware_reward_fn/std": 0.4079188406467438, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 669.0, + "completions/max_terminated_length": 669.0, + "completions/mean_length": 374.5357360839844, + "completions/mean_terminated_length": 374.5357360839844, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.8687129223626515, + "grad_norm": 0.7815769910812378, + "kl": 0.096435546875, + "learning_rate": 1e-06, + "loss": 0.0273, + "num_tokens": 97096151.0, + "reward": 1.3125, + "reward_std": 0.12005040049552917, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3125, + "rewards/curriculum_aware_reward_fn/std": 0.3057629466056824, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 356.83038330078125, + "completions/mean_terminated_length": 356.83038330078125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.8697446479236524, + "grad_norm": 0.9005613327026367, + "kl": 0.0909423828125, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 97200201.0, + "reward": 1.3375000953674316, + "reward_std": 0.1584685891866684, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.30421215295791626, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 375.0714416503906, + "completions/mean_terminated_length": 375.0714416503906, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.8707763734846531, + "grad_norm": 0.7789735198020935, + "kl": 0.0931396484375, + "learning_rate": 1e-06, + "loss": -0.0192, + "num_tokens": 97310053.0, + "reward": 1.3406250476837158, + "reward_std": 0.1421245038509369, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34062498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.31326034665107727, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1432.0, + "completions/max_terminated_length": 1432.0, + "completions/mean_length": 381.8035888671875, + "completions/mean_terminated_length": 381.8035888671875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.8718080990456538, + "grad_norm": 0.8505178689956665, + "kl": 0.0975341796875, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 97419805.0, + "reward": 1.3937500715255737, + "reward_std": 0.1500549167394638, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39374998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.31907275319099426, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1939.0, + "completions/max_terminated_length": 1939.0, + "completions/mean_length": 362.9285888671875, + "completions/mean_terminated_length": 362.9285888671875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.8728398246066547, + "grad_norm": 0.6682746410369873, + "kl": 0.0966796875, + "learning_rate": 1e-06, + "loss": -0.0051, + "num_tokens": 97517636.0, + "reward": 1.3687502145767212, + "reward_std": 0.09541250765323639, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3687500059604645, + "rewards/curriculum_aware_reward_fn/std": 0.32832667231559753, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.0, + "completions/max_terminated_length": 689.0, + "completions/mean_length": 399.77679443359375, + "completions/mean_terminated_length": 399.77679443359375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.8738715501676554, + "grad_norm": 0.6375681161880493, + "kl": 0.091552734375, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 97634091.0, + "reward": 1.3562501668930054, + "reward_std": 0.1318189799785614, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3354521691799164, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 381.2410888671875, + "completions/mean_terminated_length": 381.2410888671875, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.8749032757286562, + "grad_norm": 0.7441376447677612, + "kl": 0.10009765625, + "learning_rate": 1e-06, + "loss": 0.0073, + "num_tokens": 97744452.0, + "reward": 1.3468750715255737, + "reward_std": 0.1355905532836914, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3305251896381378, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2580.0, + "completions/max_terminated_length": 2580.0, + "completions/mean_length": 430.45538330078125, + "completions/mean_terminated_length": 430.45538330078125, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.875935001289657, + "grad_norm": 0.693661630153656, + "kl": 0.0860595703125, + "learning_rate": 1e-06, + "loss": 0.0138, + "num_tokens": 97856888.0, + "reward": 1.3218752145767212, + "reward_std": 0.14773008227348328, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3218749463558197, + "rewards/curriculum_aware_reward_fn/std": 0.29013675451278687, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 997.0, + "completions/max_terminated_length": 997.0, + "completions/mean_length": 405.1250305175781, + "completions/mean_terminated_length": 405.1250305175781, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.8769667268506577, + "grad_norm": 0.7815752029418945, + "kl": 0.080078125, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 97973270.0, + "reward": 1.3500001430511475, + "reward_std": 0.15360049903392792, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34999996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.31515759229660034, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 383.20538330078125, + "completions/mean_terminated_length": 383.20538330078125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.8779984524116585, + "grad_norm": 0.7044087648391724, + "kl": 0.0887451171875, + "learning_rate": 1e-06, + "loss": 0.0195, + "num_tokens": 98082479.0, + "reward": 1.28125, + "reward_std": 0.1456676721572876, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28125, + "rewards/curriculum_aware_reward_fn/std": 0.2965359091758728, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 931.0, + "completions/max_terminated_length": 931.0, + "completions/mean_length": 441.0000305175781, + "completions/mean_terminated_length": 441.0000305175781, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.8790301779726593, + "grad_norm": 0.8918853402137756, + "kl": 0.0916748046875, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 98202249.0, + "reward": 1.2906252145767212, + "reward_std": 0.1657445877790451, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.29297566413879395, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 381.0000305175781, + "completions/mean_terminated_length": 381.0000305175781, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.8800619035336601, + "grad_norm": 0.8854279518127441, + "kl": 0.0831298828125, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 98316314.0, + "reward": 1.2660716772079468, + "reward_std": 0.22139222919940948, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.30602067708969116, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 411.7410888671875, + "completions/mean_terminated_length": 411.7410888671875, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.8810936290946608, + "grad_norm": 0.8486577868461609, + "kl": 0.0880126953125, + "learning_rate": 1e-06, + "loss": -0.013, + "num_tokens": 98433899.0, + "reward": 1.3531252145767212, + "reward_std": 0.175072580575943, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.29896828532218933, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 829.0, + "completions/max_terminated_length": 829.0, + "completions/mean_length": 402.1339416503906, + "completions/mean_terminated_length": 402.1339416503906, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.8821253546556616, + "grad_norm": 0.8045067191123962, + "kl": 0.0709228515625, + "learning_rate": 1e-06, + "loss": 0.0155, + "num_tokens": 98546075.0, + "reward": 1.3437501192092896, + "reward_std": 0.16631193459033966, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34375, + "rewards/curriculum_aware_reward_fn/std": 0.3080105185508728, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 379.7410888671875, + "completions/mean_terminated_length": 379.7410888671875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.8831570802166624, + "grad_norm": 0.8452091813087463, + "kl": 0.0853271484375, + "learning_rate": 1e-06, + "loss": 0.016, + "num_tokens": 98658711.0, + "reward": 1.3250001668930054, + "reward_std": 0.17389629781246185, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32500001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.30704930424690247, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 417.1696472167969, + "completions/mean_terminated_length": 417.1696472167969, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.8841888057776631, + "grad_norm": 0.83913254737854, + "kl": 0.0849609375, + "learning_rate": 1e-06, + "loss": -0.0166, + "num_tokens": 98775763.0, + "reward": 1.234375238418579, + "reward_std": 0.18759752810001373, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.234375, + "rewards/curriculum_aware_reward_fn/std": 0.267372190952301, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1142.0, + "completions/max_terminated_length": 1142.0, + "completions/mean_length": 358.6339416503906, + "completions/mean_terminated_length": 358.6339416503906, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.885220531338664, + "grad_norm": 0.6839672923088074, + "kl": 0.0914306640625, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 98871601.0, + "reward": 1.4375001192092896, + "reward_std": 0.09861234575510025, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4375, + "rewards/curriculum_aware_reward_fn/std": 0.31340184807777405, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 387.8750305175781, + "completions/mean_terminated_length": 387.8750305175781, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.8862522568996647, + "grad_norm": 0.7101348638534546, + "kl": 0.0872802734375, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 98979394.0, + "reward": 1.1968750953674316, + "reward_std": 0.13633039593696594, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.25204402208328247, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 367.7857360839844, + "completions/mean_terminated_length": 367.7857360839844, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.8872839824606654, + "grad_norm": 0.7461708188056946, + "kl": 0.0897216796875, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 99089417.0, + "reward": 1.3187501430511475, + "reward_std": 0.12355701625347137, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3187499940395355, + "rewards/curriculum_aware_reward_fn/std": 0.32055166363716125, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 379.52679443359375, + "completions/mean_terminated_length": 379.52679443359375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.8883157080216663, + "grad_norm": 0.7257040739059448, + "kl": 0.090087890625, + "learning_rate": 1e-06, + "loss": -0.0127, + "num_tokens": 99192817.0, + "reward": 1.3656251430511475, + "reward_std": 0.14645807445049286, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36562496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.31998246908187866, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 333.8571472167969, + "completions/mean_terminated_length": 333.8571472167969, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.889347433582667, + "grad_norm": 0.8632603883743286, + "kl": 0.104736328125, + "learning_rate": 1e-06, + "loss": 0.0267, + "num_tokens": 99286755.0, + "reward": 1.3781250715255737, + "reward_std": 0.1209789365530014, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3156418800354004, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 371.9375305175781, + "completions/mean_terminated_length": 371.9375305175781, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.8903791591436678, + "grad_norm": 0.804155170917511, + "kl": 0.1024169921875, + "learning_rate": 1e-06, + "loss": 0.0062, + "num_tokens": 99410704.0, + "reward": 1.28125, + "reward_std": 0.1209929883480072, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28125, + "rewards/curriculum_aware_reward_fn/std": 0.34472373127937317, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 418.89288330078125, + "completions/mean_terminated_length": 418.89288330078125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.8914108847046686, + "grad_norm": 0.6388071775436401, + "kl": 0.07958984375, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 99527053.0, + "reward": 1.2250001430511475, + "reward_std": 0.08316321671009064, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22500000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.29285791516304016, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 352.4732360839844, + "completions/mean_terminated_length": 352.4732360839844, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.8924426102656693, + "grad_norm": 0.7764028310775757, + "kl": 0.096923828125, + "learning_rate": 1e-06, + "loss": -0.0175, + "num_tokens": 99630426.0, + "reward": 1.3781250715255737, + "reward_std": 0.12856173515319824, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.2704501748085022, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 959.0, + "completions/max_terminated_length": 959.0, + "completions/mean_length": 360.95538330078125, + "completions/mean_terminated_length": 360.95538330078125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.8934743358266701, + "grad_norm": 0.8951486945152283, + "kl": 0.1124267578125, + "learning_rate": 1e-06, + "loss": 0.0125, + "num_tokens": 99738494.0, + "reward": 1.2281250953674316, + "reward_std": 0.16628824174404144, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22812499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.2885020077228546, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 368.65179443359375, + "completions/mean_terminated_length": 368.65179443359375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.8945060613876709, + "grad_norm": 0.8461310267448425, + "kl": 0.0941162109375, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 99844967.0, + "reward": 1.3312500715255737, + "reward_std": 0.13890205323696136, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33124998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.28899678587913513, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 334.75, + "completions/mean_terminated_length": 334.75, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.8955377869486717, + "grad_norm": 0.9520079493522644, + "kl": 0.1060791015625, + "learning_rate": 1e-06, + "loss": 0.0101, + "num_tokens": 99947860.0, + "reward": 1.3500001430511475, + "reward_std": 0.1789051741361618, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3499999940395355, + "rewards/curriculum_aware_reward_fn/std": 0.31163617968559265, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 903.0, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 367.9196472167969, + "completions/mean_terminated_length": 367.9196472167969, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.8965695125096724, + "grad_norm": 0.9064900279045105, + "kl": 0.10302734375, + "learning_rate": 1e-06, + "loss": -0.0049, + "num_tokens": 100056708.0, + "reward": 1.3218750953674316, + "reward_std": 0.17131094634532928, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3218749463558197, + "rewards/curriculum_aware_reward_fn/std": 0.3878086507320404, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 334.40179443359375, + "completions/mean_terminated_length": 334.40179443359375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.8976012380706732, + "grad_norm": 1.0107978582382202, + "kl": 0.1094970703125, + "learning_rate": 1e-06, + "loss": 0.0049, + "num_tokens": 100164441.0, + "reward": 1.3843752145767212, + "reward_std": 0.218043714761734, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3843750059604645, + "rewards/curriculum_aware_reward_fn/std": 0.31850093603134155, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 879.0, + "completions/max_terminated_length": 879.0, + "completions/mean_length": 358.64288330078125, + "completions/mean_terminated_length": 358.64288330078125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.898632963631674, + "grad_norm": 0.8547470569610596, + "kl": 0.1004638671875, + "learning_rate": 1e-06, + "loss": 0.0047, + "num_tokens": 100266412.0, + "reward": 1.3375000953674316, + "reward_std": 0.17571650445461273, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.31839263439178467, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 330.8660888671875, + "completions/mean_terminated_length": 330.8660888671875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.8996646891926747, + "grad_norm": 0.8031458258628845, + "kl": 0.1195068359375, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 100361142.0, + "reward": 1.4312502145767212, + "reward_std": 0.12636539340019226, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4312500059604645, + "rewards/curriculum_aware_reward_fn/std": 0.2933286130428314, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 980.0, + "completions/max_terminated_length": 980.0, + "completions/mean_length": 397.1964416503906, + "completions/mean_terminated_length": 397.1964416503906, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.9006964147536756, + "grad_norm": 0.881301999092102, + "kl": 0.0902099609375, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 100477924.0, + "reward": 1.21875, + "reward_std": 0.171630397439003, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21875, + "rewards/curriculum_aware_reward_fn/std": 0.2938655614852905, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 871.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 369.45538330078125, + "completions/mean_terminated_length": 369.45538330078125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.9017281403146763, + "grad_norm": 0.660327672958374, + "kl": 0.0931396484375, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 100580651.0, + "reward": 1.3968751430511475, + "reward_std": 0.09679971635341644, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39687496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.30263715982437134, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 937.0, + "completions/max_terminated_length": 937.0, + "completions/mean_length": 445.89288330078125, + "completions/mean_terminated_length": 445.89288330078125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.902759865875677, + "grad_norm": 0.8164187669754028, + "kl": 0.09228515625, + "learning_rate": 1e-06, + "loss": -0.0287, + "num_tokens": 100704889.0, + "reward": 1.2750000953674316, + "reward_std": 0.15713198482990265, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.32012102007865906, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 395.4821472167969, + "completions/mean_terminated_length": 395.4821472167969, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.9037915914366779, + "grad_norm": 0.891213059425354, + "kl": 0.0943603515625, + "learning_rate": 1e-06, + "loss": -0.012, + "num_tokens": 100811588.0, + "reward": 1.3531250953674316, + "reward_std": 0.2118910700082779, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3743632137775421, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 364.4464416503906, + "completions/mean_terminated_length": 364.4464416503906, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.9048233169976786, + "grad_norm": 0.8460397720336914, + "kl": 0.10400390625, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 100917464.0, + "reward": 1.2062500715255737, + "reward_std": 0.12978744506835938, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.20624999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.27616459131240845, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 851.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 361.8035888671875, + "completions/mean_terminated_length": 361.8035888671875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.9058550425586794, + "grad_norm": 0.8688164353370667, + "kl": 0.1024169921875, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 101019390.0, + "reward": 1.375000238418579, + "reward_std": 0.1342623233795166, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.375, + "rewards/curriculum_aware_reward_fn/std": 0.3176490068435669, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1071.0, + "completions/max_terminated_length": 1071.0, + "completions/mean_length": 432.39288330078125, + "completions/mean_terminated_length": 432.39288330078125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.9068867681196802, + "grad_norm": 0.7971656322479248, + "kl": 0.083740234375, + "learning_rate": 1e-06, + "loss": 0.0371, + "num_tokens": 101136012.0, + "reward": 1.3156250715255737, + "reward_std": 0.14622807502746582, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.30067726969718933, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 415.0982360839844, + "completions/mean_terminated_length": 415.0982360839844, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.9079184936806809, + "grad_norm": 0.6433001160621643, + "kl": 0.08544921875, + "learning_rate": 1e-06, + "loss": -0.0238, + "num_tokens": 101242500.0, + "reward": 1.3406251668930054, + "reward_std": 0.09915289282798767, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34062501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.33040592074394226, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 427.8750305175781, + "completions/mean_terminated_length": 427.8750305175781, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.9089502192416817, + "grad_norm": 0.7412571907043457, + "kl": 0.0904541015625, + "learning_rate": 1e-06, + "loss": -0.0097, + "num_tokens": 101354430.0, + "reward": 1.28125, + "reward_std": 0.13072429597377777, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28125, + "rewards/curriculum_aware_reward_fn/std": 0.2965359091758728, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 390.58929443359375, + "completions/mean_terminated_length": 390.58929443359375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.9099819448026825, + "grad_norm": 0.9273052215576172, + "kl": 0.1094970703125, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 101464879.0, + "reward": 1.2660716772079468, + "reward_std": 0.15056000649929047, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.31315022706985474, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.0, + "completions/max_terminated_length": 610.0, + "completions/mean_length": 402.2946472167969, + "completions/mean_terminated_length": 402.2946472167969, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.9110136703636833, + "grad_norm": 0.8218299746513367, + "kl": 0.090087890625, + "learning_rate": 1e-06, + "loss": -0.0154, + "num_tokens": 101572901.0, + "reward": 1.2718751430511475, + "reward_std": 0.14793546497821808, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2718749940395355, + "rewards/curriculum_aware_reward_fn/std": 0.284650981426239, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 398.83929443359375, + "completions/mean_terminated_length": 398.83929443359375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.912045395924684, + "grad_norm": 0.9065353274345398, + "kl": 0.1005859375, + "learning_rate": 1e-06, + "loss": -0.0066, + "num_tokens": 101675615.0, + "reward": 1.3062500953674316, + "reward_std": 0.19413165748119354, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3062499463558197, + "rewards/curriculum_aware_reward_fn/std": 0.2900858223438263, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 421.6339416503906, + "completions/mean_terminated_length": 421.6339416503906, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.9130771214856848, + "grad_norm": 0.6250420212745667, + "kl": 0.0860595703125, + "learning_rate": 1e-06, + "loss": 0.0224, + "num_tokens": 101783689.0, + "reward": 1.3066965341567993, + "reward_std": 0.10317334532737732, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3253571093082428, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 431.5982360839844, + "completions/mean_terminated_length": 431.5982360839844, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.9141088470466856, + "grad_norm": 0.8888685703277588, + "kl": 0.1077880859375, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 101901874.0, + "reward": 1.3531252145767212, + "reward_std": 0.15322957932949066, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.29525384306907654, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 420.5714416503906, + "completions/mean_terminated_length": 420.5714416503906, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.9151405726076863, + "grad_norm": 0.7968472242355347, + "kl": 0.09375, + "learning_rate": 1e-06, + "loss": -0.0153, + "num_tokens": 102019160.0, + "reward": 1.3218750953674316, + "reward_std": 0.13992083072662354, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3218750059604645, + "rewards/curriculum_aware_reward_fn/std": 0.30133193731307983, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 429.77679443359375, + "completions/mean_terminated_length": 429.77679443359375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.9161722981686872, + "grad_norm": 0.6858448386192322, + "kl": 0.1043701171875, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 102129896.0, + "reward": 1.303125023841858, + "reward_std": 0.12142420560121536, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30312496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.3305251896381378, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1049.0, + "completions/max_terminated_length": 1049.0, + "completions/mean_length": 464.20538330078125, + "completions/mean_terminated_length": 464.20538330078125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.9172040237296879, + "grad_norm": 0.6691279411315918, + "kl": 0.083984375, + "learning_rate": 1e-06, + "loss": -0.0139, + "num_tokens": 102244341.0, + "reward": 1.296875, + "reward_std": 0.13214033842086792, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.296875, + "rewards/curriculum_aware_reward_fn/std": 0.3123783469200134, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1458.0, + "completions/max_terminated_length": 1458.0, + "completions/mean_length": 528.2232666015625, + "completions/mean_terminated_length": 528.2232666015625, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.9182357492906886, + "grad_norm": 0.8207437992095947, + "kl": 0.074462890625, + "learning_rate": 1e-06, + "loss": -0.0118, + "num_tokens": 102366047.0, + "reward": 1.1968750953674316, + "reward_std": 0.16041667759418488, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.2849277853965759, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 892.0, + "completions/max_terminated_length": 892.0, + "completions/mean_length": 457.3482360839844, + "completions/mean_terminated_length": 457.3482360839844, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.9192674748516895, + "grad_norm": 0.7058476805686951, + "kl": 0.1005859375, + "learning_rate": 1e-06, + "loss": 0.0049, + "num_tokens": 102486990.0, + "reward": 1.3250001668930054, + "reward_std": 0.14487729966640472, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32499998807907104, + "rewards/curriculum_aware_reward_fn/std": 0.40609100461006165, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1023.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 471.90179443359375, + "completions/mean_terminated_length": 471.90179443359375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.9202992004126902, + "grad_norm": 0.6852114796638489, + "kl": 0.0927734375, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 102610319.0, + "reward": 1.2535713911056519, + "reward_std": 0.11259433627128601, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.26249998807907104, + "rewards/curriculum_aware_reward_fn/std": 0.28383633494377136, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1055.0, + "completions/max_terminated_length": 1055.0, + "completions/mean_length": 482.857177734375, + "completions/mean_terminated_length": 482.857177734375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.921330925973691, + "grad_norm": 0.7629786133766174, + "kl": 0.086181640625, + "learning_rate": 1e-06, + "loss": -0.0122, + "num_tokens": 102730402.0, + "reward": 1.3129466772079468, + "reward_std": 0.2128395140171051, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.32187503576278687, + "rewards/curriculum_aware_reward_fn/std": 0.31911906599998474, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 929.0, + "completions/max_terminated_length": 929.0, + "completions/mean_length": 489.0982360839844, + "completions/mean_terminated_length": 489.0982360839844, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.9223626515346918, + "grad_norm": 0.6978335380554199, + "kl": 0.08154296875, + "learning_rate": 1e-06, + "loss": 0.011, + "num_tokens": 102848804.0, + "reward": 1.2946430444717407, + "reward_std": 0.21274369955062866, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.3125, + "rewards/curriculum_aware_reward_fn/std": 0.31987470388412476, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1120.0, + "completions/max_terminated_length": 1120.0, + "completions/mean_length": 521.3125, + "completions/mean_terminated_length": 521.3125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.9233943770956925, + "grad_norm": 0.7006912231445312, + "kl": 0.08349609375, + "learning_rate": 1e-06, + "loss": -0.0092, + "num_tokens": 102982035.0, + "reward": 1.240625023841858, + "reward_std": 0.13423092663288116, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24062497913837433, + "rewards/curriculum_aware_reward_fn/std": 0.2780669629573822, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1302.0, + "completions/max_terminated_length": 1302.0, + "completions/mean_length": 473.90179443359375, + "completions/mean_terminated_length": 473.90179443359375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.9244261026566933, + "grad_norm": 0.8263719081878662, + "kl": 0.0882568359375, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 103108528.0, + "reward": 1.3531252145767212, + "reward_std": 0.17819008231163025, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.2914920449256897, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 411.8125305175781, + "completions/mean_terminated_length": 411.8125305175781, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.9254578282176941, + "grad_norm": 0.5894196629524231, + "kl": 0.10205078125, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 103214179.0, + "reward": 1.4312502145767212, + "reward_std": 0.10276122391223907, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4312500059604645, + "rewards/curriculum_aware_reward_fn/std": 0.31509506702423096, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 440.5535888671875, + "completions/mean_terminated_length": 440.5535888671875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.9264895537786949, + "grad_norm": 0.6899087429046631, + "kl": 0.0850830078125, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 103324337.0, + "reward": 1.371875286102295, + "reward_std": 0.1472577452659607, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37187501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.29817622900009155, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1060.0, + "completions/max_terminated_length": 1060.0, + "completions/mean_length": 473.1607360839844, + "completions/mean_terminated_length": 473.1607360839844, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.9275212793396956, + "grad_norm": 0.6388729214668274, + "kl": 0.0850830078125, + "learning_rate": 1e-06, + "loss": -0.0178, + "num_tokens": 103442473.0, + "reward": 1.3156250715255737, + "reward_std": 0.13890205323696136, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.31501689553260803, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1217.0, + "completions/max_terminated_length": 1217.0, + "completions/mean_length": 504.4910888671875, + "completions/mean_terminated_length": 504.4910888671875, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.9285530049006964, + "grad_norm": 0.651702880859375, + "kl": 0.092041015625, + "learning_rate": 1e-06, + "loss": 0.0121, + "num_tokens": 103582108.0, + "reward": 1.1218750476837158, + "reward_std": 0.14888589084148407, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.12187499552965164, + "rewards/curriculum_aware_reward_fn/std": 0.22388675808906555, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1006.0, + "completions/max_terminated_length": 1006.0, + "completions/mean_length": 508.1875305175781, + "completions/mean_terminated_length": 508.1875305175781, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.9295847304616972, + "grad_norm": 0.7340144515037537, + "kl": 0.093994140625, + "learning_rate": 1e-06, + "loss": -0.0038, + "num_tokens": 103709893.0, + "reward": 1.2625001668930054, + "reward_std": 0.1418541967868805, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.26249998807907104, + "rewards/curriculum_aware_reward_fn/std": 0.30627813935279846, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 478.5625305175781, + "completions/mean_terminated_length": 478.5625305175781, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.9306164560226979, + "grad_norm": 0.771500825881958, + "kl": 0.091796875, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 103828620.0, + "reward": 1.2879464626312256, + "reward_std": 0.17468391358852386, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.296875, + "rewards/curriculum_aware_reward_fn/std": 0.3123783469200134, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 473.7857360839844, + "completions/mean_terminated_length": 473.7857360839844, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.9316481815836988, + "grad_norm": 0.7887445092201233, + "kl": 0.086181640625, + "learning_rate": 1e-06, + "loss": -0.0149, + "num_tokens": 103944649.0, + "reward": 1.312500238418579, + "reward_std": 0.17563453316688538, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3125, + "rewards/curriculum_aware_reward_fn/std": 0.2947360873222351, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1001.0, + "completions/max_terminated_length": 1001.0, + "completions/mean_length": 458.6964416503906, + "completions/mean_terminated_length": 458.6964416503906, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.9326799071446995, + "grad_norm": 0.7262348532676697, + "kl": 0.0970458984375, + "learning_rate": 1e-06, + "loss": 0.0161, + "num_tokens": 104060647.0, + "reward": 1.3093751668930054, + "reward_std": 0.13768966495990753, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3035474121570587, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 805.0, + "completions/max_terminated_length": 805.0, + "completions/mean_length": 448.0982360839844, + "completions/mean_terminated_length": 448.0982360839844, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.9337116327057002, + "grad_norm": 0.7300897240638733, + "kl": 0.10791015625, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 104170740.0, + "reward": 1.2750002145767212, + "reward_std": 0.13089033961296082, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3060206472873688, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 499.607177734375, + "completions/mean_terminated_length": 499.607177734375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.9347433582667011, + "grad_norm": 0.7067446708679199, + "kl": 0.0953369140625, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 104292174.0, + "reward": 1.296875, + "reward_std": 0.125174880027771, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.296875, + "rewards/curriculum_aware_reward_fn/std": 0.27877476811408997, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 918.0, + "completions/max_terminated_length": 918.0, + "completions/mean_length": 499.76788330078125, + "completions/mean_terminated_length": 499.76788330078125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.9357750838277018, + "grad_norm": 0.7178418636322021, + "kl": 0.1014404296875, + "learning_rate": 1e-06, + "loss": -0.0112, + "num_tokens": 104416811.0, + "reward": 1.2875001430511475, + "reward_std": 0.13409018516540527, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2874999940395355, + "rewards/curriculum_aware_reward_fn/std": 0.3262191116809845, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1000.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 481.6339416503906, + "completions/mean_terminated_length": 481.6339416503906, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.9368068093887026, + "grad_norm": 0.7717330455780029, + "kl": 0.0947265625, + "learning_rate": 1e-06, + "loss": -0.0127, + "num_tokens": 104539340.0, + "reward": 1.2906252145767212, + "reward_std": 0.14821362495422363, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3004149794578552, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 828.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 508.4375305175781, + "completions/mean_terminated_length": 508.4375305175781, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.9378385349497034, + "grad_norm": 0.6931024789810181, + "kl": 0.0836181640625, + "learning_rate": 1e-06, + "loss": -0.0075, + "num_tokens": 104664367.0, + "reward": 1.3062502145767212, + "reward_std": 0.1397494077682495, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645, + "rewards/curriculum_aware_reward_fn/std": 0.2900857925415039, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1006.0, + "completions/max_terminated_length": 1006.0, + "completions/mean_length": 534.7053833007812, + "completions/mean_terminated_length": 534.7053833007812, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.9388702605107041, + "grad_norm": 0.640984296798706, + "kl": 0.0953369140625, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 104789816.0, + "reward": 1.3406251668930054, + "reward_std": 0.13892993330955505, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34062501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3887222707271576, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 939.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 466.21429443359375, + "completions/mean_terminated_length": 466.21429443359375, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.9399019860717049, + "grad_norm": 0.6224113702774048, + "kl": 0.0948486328125, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 104908161.0, + "reward": 1.2973215579986572, + "reward_std": 0.1287800669670105, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.30625003576278687, + "rewards/curriculum_aware_reward_fn/std": 0.3359218239784241, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 493.58038330078125, + "completions/mean_terminated_length": 493.58038330078125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.9409337116327057, + "grad_norm": 0.6648264527320862, + "kl": 0.0855712890625, + "learning_rate": 1e-06, + "loss": 0.0144, + "num_tokens": 105028752.0, + "reward": 1.2468750476837158, + "reward_std": 0.13067768514156342, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24687500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.30315765738487244, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 488.169677734375, + "completions/mean_terminated_length": 488.169677734375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.9419654371937065, + "grad_norm": 0.7831220626831055, + "kl": 0.0931396484375, + "learning_rate": 1e-06, + "loss": 0.0193, + "num_tokens": 105149780.0, + "reward": 1.275892972946167, + "reward_std": 0.2222750186920166, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3064711391925812, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1377.0, + "completions/max_terminated_length": 1377.0, + "completions/mean_length": 543.3035888671875, + "completions/mean_terminated_length": 543.3035888671875, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.9429971627547072, + "grad_norm": 0.7976868748664856, + "kl": 0.08984375, + "learning_rate": 1e-06, + "loss": 0.0172, + "num_tokens": 105278362.0, + "reward": 1.2531250715255737, + "reward_std": 0.18378396332263947, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25312498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.2747874855995178, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1282.0, + "completions/max_terminated_length": 1282.0, + "completions/mean_length": 565.607177734375, + "completions/mean_terminated_length": 565.607177734375, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.944028888315708, + "grad_norm": 0.6149550080299377, + "kl": 0.0889892578125, + "learning_rate": 1e-06, + "loss": 0.006, + "num_tokens": 105419774.0, + "reward": 1.2218750715255737, + "reward_std": 0.10345561802387238, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22187498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.29711687564849854, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 972.0, + "completions/max_terminated_length": 972.0, + "completions/mean_length": 491.8750305175781, + "completions/mean_terminated_length": 491.8750305175781, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.9450606138767088, + "grad_norm": 0.782224714756012, + "kl": 0.0875244140625, + "learning_rate": 1e-06, + "loss": -0.0176, + "num_tokens": 105543814.0, + "reward": 1.4281251430511475, + "reward_std": 0.1730240136384964, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4281249940395355, + "rewards/curriculum_aware_reward_fn/std": 0.2923022508621216, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2439.0, + "completions/max_terminated_length": 2439.0, + "completions/mean_length": 503.6964416503906, + "completions/mean_terminated_length": 503.6964416503906, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.9460923394377095, + "grad_norm": 0.786839485168457, + "kl": 0.0928955078125, + "learning_rate": 1e-06, + "loss": 0.0078, + "num_tokens": 105672468.0, + "reward": 1.3031251430511475, + "reward_std": 0.1552998125553131, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3031249940395355, + "rewards/curriculum_aware_reward_fn/std": 0.30263715982437134, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 515.0982666015625, + "completions/mean_terminated_length": 515.0982666015625, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.9471240649987104, + "grad_norm": 0.7849997878074646, + "kl": 0.0869140625, + "learning_rate": 1e-06, + "loss": 0.0216, + "num_tokens": 105798969.0, + "reward": 1.2937501668930054, + "reward_std": 0.2153172492980957, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3064711391925812, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 899.0, + "completions/max_terminated_length": 899.0, + "completions/mean_length": 458.0714416503906, + "completions/mean_terminated_length": 458.0714416503906, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.9481557905597111, + "grad_norm": 0.604444682598114, + "kl": 0.084228515625, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 105912510.0, + "reward": 1.3781250715255737, + "reward_std": 0.11152077466249466, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.32596227526664734, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1160.0, + "completions/max_terminated_length": 1160.0, + "completions/mean_length": 500.7232360839844, + "completions/mean_terminated_length": 500.7232360839844, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.9491875161207118, + "grad_norm": 0.7500540018081665, + "kl": 0.097412109375, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 106034706.0, + "reward": 1.309375286102295, + "reward_std": 0.15969355404376984, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30937498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.29243704676628113, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1385.0, + "completions/max_terminated_length": 1385.0, + "completions/mean_length": 535.7678833007812, + "completions/mean_terminated_length": 535.7678833007812, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.9502192416817127, + "grad_norm": 0.8850662112236023, + "kl": 0.0933837890625, + "learning_rate": 1e-06, + "loss": -0.0163, + "num_tokens": 106158363.0, + "reward": 1.3375000953674316, + "reward_std": 0.11517629027366638, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33750003576278687, + "rewards/curriculum_aware_reward_fn/std": 0.32525110244750977, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1005.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 513.9017944335938, + "completions/mean_terminated_length": 513.9017944335938, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.9512509672427134, + "grad_norm": 0.6601965427398682, + "kl": 0.088134765625, + "learning_rate": 1e-06, + "loss": 0.0051, + "num_tokens": 106284660.0, + "reward": 1.2781251668930054, + "reward_std": 0.16729682683944702, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.27812498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.3156418800354004, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 972.0, + "completions/max_terminated_length": 972.0, + "completions/mean_length": 474.6250305175781, + "completions/mean_terminated_length": 474.6250305175781, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.9522826928037142, + "grad_norm": 0.8933461904525757, + "kl": 0.101806640625, + "learning_rate": 1e-06, + "loss": 0.0261, + "num_tokens": 106400034.0, + "reward": 1.3375002145767212, + "reward_std": 0.21545302867889404, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.30781853199005127, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1058.0, + "completions/max_terminated_length": 1058.0, + "completions/mean_length": 517.5803833007812, + "completions/mean_terminated_length": 517.5803833007812, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.953314418364715, + "grad_norm": 0.5526024103164673, + "kl": 0.0885009765625, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 106525028.0, + "reward": 1.3531252145767212, + "reward_std": 0.09949037432670593, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3305251896381378, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 929.0, + "completions/max_terminated_length": 929.0, + "completions/mean_length": 523.7767944335938, + "completions/mean_terminated_length": 523.7767944335938, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.9543461439257157, + "grad_norm": 0.7154967188835144, + "kl": 0.0843505859375, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 106653523.0, + "reward": 1.2660716772079468, + "reward_std": 0.21331168711185455, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.32355013489723206, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 901.0, + "completions/max_terminated_length": 901.0, + "completions/mean_length": 449.64288330078125, + "completions/mean_terminated_length": 449.64288330078125, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.9553778694867165, + "grad_norm": 0.6225499510765076, + "kl": 0.0948486328125, + "learning_rate": 1e-06, + "loss": 0.026, + "num_tokens": 106772896.0, + "reward": 1.4000002145767212, + "reward_std": 0.10782329738140106, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4000000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.31465697288513184, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 904.0, + "completions/max_terminated_length": 904.0, + "completions/mean_length": 445.544677734375, + "completions/mean_terminated_length": 445.544677734375, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.9564095950477173, + "grad_norm": 0.8484821915626526, + "kl": 0.095458984375, + "learning_rate": 1e-06, + "loss": -0.0083, + "num_tokens": 106885286.0, + "reward": 1.3437501192092896, + "reward_std": 0.17554441094398499, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34375, + "rewards/curriculum_aware_reward_fn/std": 0.32543283700942993, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 463.8750305175781, + "completions/mean_terminated_length": 463.8750305175781, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.9574413206087181, + "grad_norm": 0.7585766911506653, + "kl": 0.0902099609375, + "learning_rate": 1e-06, + "loss": 0.0175, + "num_tokens": 107002613.0, + "reward": 1.3562501668930054, + "reward_std": 0.17320376634597778, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.30440643429756165, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 897.0, + "completions/max_terminated_length": 897.0, + "completions/mean_length": 527.857177734375, + "completions/mean_terminated_length": 527.857177734375, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.9584730461697188, + "grad_norm": 0.6367157101631165, + "kl": 0.0794677734375, + "learning_rate": 1e-06, + "loss": -0.0096, + "num_tokens": 107130872.0, + "reward": 1.2468751668930054, + "reward_std": 0.16819968819618225, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24687500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.2920324206352234, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1474.0, + "completions/max_terminated_length": 1474.0, + "completions/mean_length": 537.3214721679688, + "completions/mean_terminated_length": 537.3214721679688, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.9595047717307196, + "grad_norm": 0.7050769329071045, + "kl": 0.088623046875, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 107258133.0, + "reward": 1.3343751430511475, + "reward_std": 0.19970746338367462, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.32680758833885193, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 512.4017944335938, + "completions/mean_terminated_length": 512.4017944335938, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.9605364972917204, + "grad_norm": 0.63140469789505, + "kl": 0.0830078125, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 107384787.0, + "reward": 1.1968750953674316, + "reward_std": 0.09665969759225845, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.2963198721408844, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1245.0, + "completions/max_terminated_length": 1245.0, + "completions/mean_length": 490.1785888671875, + "completions/mean_terminated_length": 490.1785888671875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.9615682228527211, + "grad_norm": 0.7270257472991943, + "kl": 0.08984375, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 107502001.0, + "reward": 1.2723214626312256, + "reward_std": 0.1774478703737259, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.28125, + "rewards/curriculum_aware_reward_fn/std": 0.30023449659347534, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1106.0, + "completions/max_terminated_length": 1106.0, + "completions/mean_length": 521.5982666015625, + "completions/mean_terminated_length": 521.5982666015625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.962599948413722, + "grad_norm": 0.7357829213142395, + "kl": 0.1044921875, + "learning_rate": 1e-06, + "loss": 0.0343, + "num_tokens": 107634711.0, + "reward": 1.1973215341567993, + "reward_std": 0.1833607256412506, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.20624998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.2840445339679718, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1322.0, + "completions/max_terminated_length": 1322.0, + "completions/mean_length": 479.90179443359375, + "completions/mean_terminated_length": 479.90179443359375, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.9636316739747227, + "grad_norm": 0.6669960021972656, + "kl": 0.09765625, + "learning_rate": 1e-06, + "loss": -0.0036, + "num_tokens": 107757557.0, + "reward": 1.3968751430511475, + "reward_std": 0.14331825077533722, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39687496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.30263715982437134, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 852.0, + "completions/max_terminated_length": 852.0, + "completions/mean_length": 458.20538330078125, + "completions/mean_terminated_length": 458.20538330078125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.9646633995357234, + "grad_norm": 0.5764197111129761, + "kl": 0.0992431640625, + "learning_rate": 1e-06, + "loss": -0.0044, + "num_tokens": 107869097.0, + "reward": 1.3312500715255737, + "reward_std": 0.10730495303869247, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33124998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3145943284034729, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2205.0, + "completions/max_terminated_length": 2205.0, + "completions/mean_length": 528.294677734375, + "completions/mean_terminated_length": 528.294677734375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.9656951250967243, + "grad_norm": 0.7136504650115967, + "kl": 0.0906982421875, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 107996614.0, + "reward": 1.171875, + "reward_std": 0.18532191216945648, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.171875, + "rewards/curriculum_aware_reward_fn/std": 0.2657456696033478, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1406.0, + "completions/max_terminated_length": 1406.0, + "completions/mean_length": 549.482177734375, + "completions/mean_terminated_length": 549.482177734375, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.966726850657725, + "grad_norm": 0.7116082906723022, + "kl": 0.0865478515625, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 108123494.0, + "reward": 1.28125, + "reward_std": 0.16612868010997772, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28125, + "rewards/curriculum_aware_reward_fn/std": 0.31106650829315186, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1171.0, + "completions/max_terminated_length": 1171.0, + "completions/mean_length": 495.7232360839844, + "completions/mean_terminated_length": 495.7232360839844, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.9677585762187259, + "grad_norm": 0.760637640953064, + "kl": 0.0909423828125, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 108245816.0, + "reward": 1.328125, + "reward_std": 0.21447554230690002, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.328125, + "rewards/curriculum_aware_reward_fn/std": 0.3994241952896118, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1347.0, + "completions/max_terminated_length": 1347.0, + "completions/mean_length": 536.9375, + "completions/mean_terminated_length": 536.9375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.9687903017797266, + "grad_norm": 0.6102328300476074, + "kl": 0.087158203125, + "learning_rate": 1e-06, + "loss": 0.0146, + "num_tokens": 108379873.0, + "reward": 1.3187501430511475, + "reward_std": 0.1227864921092987, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3187499940395355, + "rewards/curriculum_aware_reward_fn/std": 0.3135904371738434, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1049.0, + "completions/max_terminated_length": 1049.0, + "completions/mean_length": 508.4107360839844, + "completions/mean_terminated_length": 508.4107360839844, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.9698220273407274, + "grad_norm": 0.6803673505783081, + "kl": 0.09033203125, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 108501726.0, + "reward": 1.3562501668930054, + "reward_std": 0.12399572134017944, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.31157293915748596, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1734.0, + "completions/max_terminated_length": 1734.0, + "completions/mean_length": 502.71429443359375, + "completions/mean_terminated_length": 502.71429443359375, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.9708537529017282, + "grad_norm": 0.5645270943641663, + "kl": 0.099853515625, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 108627841.0, + "reward": 1.3000000715255737, + "reward_std": 0.0953846201300621, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29999998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3181449770927429, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 484.02679443359375, + "completions/mean_terminated_length": 484.02679443359375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.9718854784627289, + "grad_norm": 0.7351107597351074, + "kl": 0.100830078125, + "learning_rate": 1e-06, + "loss": 0.0055, + "num_tokens": 108751107.0, + "reward": 1.2406251430511475, + "reward_std": 0.13194237649440765, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24062500894069672, + "rewards/curriculum_aware_reward_fn/std": 0.28972890973091125, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 929.0, + "completions/max_terminated_length": 929.0, + "completions/mean_length": 473.5089416503906, + "completions/mean_terminated_length": 473.5089416503906, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.9729172040237297, + "grad_norm": 0.74190354347229, + "kl": 0.099853515625, + "learning_rate": 1e-06, + "loss": -0.008, + "num_tokens": 108866685.0, + "reward": 1.2437502145767212, + "reward_std": 0.1013740599155426, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24374999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.3074982464313507, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 501.08038330078125, + "completions/mean_terminated_length": 501.08038330078125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.9739489295847304, + "grad_norm": 0.7810709476470947, + "kl": 0.1024169921875, + "learning_rate": 1e-06, + "loss": 0.0204, + "num_tokens": 108995851.0, + "reward": 1.2062500715255737, + "reward_std": 0.15930792689323425, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.20624999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.2840445339679718, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 997.0, + "completions/max_terminated_length": 997.0, + "completions/mean_length": 513.3125, + "completions/mean_terminated_length": 513.3125, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.9749806551457313, + "grad_norm": 0.8026859760284424, + "kl": 0.093505859375, + "learning_rate": 1e-06, + "loss": 0.0276, + "num_tokens": 109120353.0, + "reward": 1.3375002145767212, + "reward_std": 0.1909678429365158, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.31490740180015564, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 467.669677734375, + "completions/mean_terminated_length": 467.669677734375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.976012380706732, + "grad_norm": 0.6618792414665222, + "kl": 0.1016845703125, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 109237776.0, + "reward": 1.3125, + "reward_std": 0.12639063596725464, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3125, + "rewards/curriculum_aware_reward_fn/std": 0.30935126543045044, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 974.0, + "completions/max_terminated_length": 974.0, + "completions/mean_length": 487.83038330078125, + "completions/mean_terminated_length": 487.83038330078125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.9770441062677327, + "grad_norm": 0.810634195804596, + "kl": 0.09619140625, + "learning_rate": 1e-06, + "loss": 0.0166, + "num_tokens": 109348672.0, + "reward": 1.340625286102295, + "reward_std": 0.16616889834403992, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34062501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.316763699054718, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 900.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 480.8125305175781, + "completions/mean_terminated_length": 480.8125305175781, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.9780758318287336, + "grad_norm": 0.7666967511177063, + "kl": 0.101806640625, + "learning_rate": 1e-06, + "loss": -0.0079, + "num_tokens": 109470493.0, + "reward": 1.3187501430511475, + "reward_std": 0.13620880246162415, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.31874996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.295470654964447, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 490.2857360839844, + "completions/mean_terminated_length": 490.2857360839844, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.9791075573897343, + "grad_norm": 0.8252652287483215, + "kl": 0.0966796875, + "learning_rate": 1e-06, + "loss": 0.0117, + "num_tokens": 109593396.0, + "reward": 1.3191965818405151, + "reward_std": 0.166405588388443, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.328125, + "rewards/curriculum_aware_reward_fn/std": 0.31614094972610474, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.0, + "completions/max_terminated_length": 742.0, + "completions/mean_length": 455.5982360839844, + "completions/mean_terminated_length": 455.5982360839844, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.9801392829507352, + "grad_norm": 0.9774068593978882, + "kl": 0.107177734375, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 109720835.0, + "reward": 1.2093751430511475, + "reward_std": 0.22767837345600128, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.20937499403953552, + "rewards/curriculum_aware_reward_fn/std": 0.27578970789909363, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 892.0, + "completions/max_terminated_length": 892.0, + "completions/mean_length": 465.95538330078125, + "completions/mean_terminated_length": 465.95538330078125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.9811710085117359, + "grad_norm": 0.7859709858894348, + "kl": 0.115966796875, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 109841398.0, + "reward": 1.2906252145767212, + "reward_std": 0.15378578007221222, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.30406635999679565, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1094.0, + "completions/max_terminated_length": 1094.0, + "completions/mean_length": 465.419677734375, + "completions/mean_terminated_length": 465.419677734375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.9822027340727366, + "grad_norm": 0.7218555808067322, + "kl": 0.097412109375, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 109956700.0, + "reward": 1.3187501430511475, + "reward_std": 0.12293457984924316, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3187499940395355, + "rewards/curriculum_aware_reward_fn/std": 0.295470654964447, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 935.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 536.8482666015625, + "completions/mean_terminated_length": 536.8482666015625, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.9832344596337375, + "grad_norm": 0.6031023263931274, + "kl": 0.08154296875, + "learning_rate": 1e-06, + "loss": 0.0244, + "num_tokens": 110087401.0, + "reward": 1.1843751668930054, + "reward_std": 0.15871918201446533, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.18437497317790985, + "rewards/curriculum_aware_reward_fn/std": 0.27378159761428833, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 952.0, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 443.5089416503906, + "completions/mean_terminated_length": 443.5089416503906, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.9842661851947382, + "grad_norm": 0.5875775814056396, + "kl": 0.1033935546875, + "learning_rate": 1e-06, + "loss": 0.0061, + "num_tokens": 110196931.0, + "reward": 1.2504466772079468, + "reward_std": 0.07587282359600067, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.2593750059604645, + "rewards/curriculum_aware_reward_fn/std": 0.32121190428733826, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 454.5535888671875, + "completions/mean_terminated_length": 454.5535888671875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.985297910755739, + "grad_norm": 0.9105221033096313, + "kl": 0.100341796875, + "learning_rate": 1e-06, + "loss": 0.0029, + "num_tokens": 110320515.0, + "reward": 1.2937500476837158, + "reward_std": 0.21526940166950226, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3135904371738434, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 453.607177734375, + "completions/mean_terminated_length": 453.607177734375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.9863296363167398, + "grad_norm": 0.7779031991958618, + "kl": 0.107666015625, + "learning_rate": 1e-06, + "loss": 0.0369, + "num_tokens": 110435467.0, + "reward": 1.3187501430511475, + "reward_std": 0.15156292915344238, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3187499940395355, + "rewards/curriculum_aware_reward_fn/std": 0.3064711391925812, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 516.5803833007812, + "completions/mean_terminated_length": 516.5803833007812, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.9873613618777405, + "grad_norm": 0.8285189867019653, + "kl": 0.0853271484375, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 110568476.0, + "reward": 1.2718751430511475, + "reward_std": 0.19677434861660004, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2718749940395355, + "rewards/curriculum_aware_reward_fn/std": 0.29975825548171997, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 967.0, + "completions/max_terminated_length": 967.0, + "completions/mean_length": 474.0982360839844, + "completions/mean_terminated_length": 474.0982360839844, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.9883930874387413, + "grad_norm": 0.8431401252746582, + "kl": 0.1002197265625, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 110690413.0, + "reward": 1.3500001430511475, + "reward_std": 0.1829722374677658, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34999996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.31515759229660034, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 912.0, + "completions/max_terminated_length": 912.0, + "completions/mean_length": 449.0625305175781, + "completions/mean_terminated_length": 449.0625305175781, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.989424812999742, + "grad_norm": 0.7426992058753967, + "kl": 0.1060791015625, + "learning_rate": 1e-06, + "loss": -0.0066, + "num_tokens": 110798381.0, + "reward": 1.296875, + "reward_std": 0.12052858620882034, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.296875, + "rewards/curriculum_aware_reward_fn/std": 0.3228031396865845, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1435.0, + "completions/max_terminated_length": 1435.0, + "completions/mean_length": 451.607177734375, + "completions/mean_terminated_length": 451.607177734375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.9904565385607429, + "grad_norm": 0.7545911073684692, + "kl": 0.098876953125, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 110920053.0, + "reward": 1.3285715579986572, + "reward_std": 0.1768074631690979, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.31839263439178467, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 466.8482360839844, + "completions/mean_terminated_length": 466.8482360839844, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.9914882641217436, + "grad_norm": 0.6909697651863098, + "kl": 0.09912109375, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 111045826.0, + "reward": 1.2718751430511475, + "reward_std": 0.10017166286706924, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2718749940395355, + "rewards/curriculum_aware_reward_fn/std": 0.28850194811820984, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 862.0, + "completions/max_terminated_length": 862.0, + "completions/mean_length": 434.5446472167969, + "completions/mean_terminated_length": 434.5446472167969, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.9925199896827444, + "grad_norm": 0.8224356770515442, + "kl": 0.1121826171875, + "learning_rate": 1e-06, + "loss": 0.0283, + "num_tokens": 111161142.0, + "reward": 1.3437501192092896, + "reward_std": 0.1928206980228424, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34375, + "rewards/curriculum_aware_reward_fn/std": 0.3080105185508728, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 444.0357360839844, + "completions/mean_terminated_length": 444.0357360839844, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.9935517152437452, + "grad_norm": 0.7404220700263977, + "kl": 0.0989990234375, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 111280391.0, + "reward": 1.2531250715255737, + "reward_std": 0.16437771916389465, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25312498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.29791173338890076, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1311.0, + "completions/max_terminated_length": 1311.0, + "completions/mean_length": 490.732177734375, + "completions/mean_terminated_length": 490.732177734375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.9945834408047459, + "grad_norm": 0.6985900402069092, + "kl": 0.0899658203125, + "learning_rate": 1e-06, + "loss": 0.0199, + "num_tokens": 111401993.0, + "reward": 1.312500238418579, + "reward_std": 0.18294866383075714, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3125, + "rewards/curriculum_aware_reward_fn/std": 0.3267020583152771, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 994.0, + "completions/max_terminated_length": 994.0, + "completions/mean_length": 453.65179443359375, + "completions/mean_terminated_length": 453.65179443359375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.9956151663657468, + "grad_norm": 0.7975105047225952, + "kl": 0.1044921875, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 111515641.0, + "reward": 1.2843750715255737, + "reward_std": 0.11395810544490814, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28437498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.30997174978256226, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 473.71429443359375, + "completions/mean_terminated_length": 441.0810852050781, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.9966468919267475, + "grad_norm": 0.8458168506622314, + "kl": 0.1005859375, + "learning_rate": 1e-06, + "loss": 0.0504, + "num_tokens": 111637472.0, + "reward": 1.4004465341567993, + "reward_std": 0.22842104732990265, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.40937498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3147665560245514, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 987.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 471.33929443359375, + "completions/mean_terminated_length": 471.33929443359375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.9976786174877482, + "grad_norm": 0.7415000200271606, + "kl": 0.0968017578125, + "learning_rate": 1e-06, + "loss": 0.0029, + "num_tokens": 111761053.0, + "reward": 1.2250001430511475, + "reward_std": 0.16037410497665405, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22500000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.29660236835479736, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 454.669677734375, + "completions/mean_terminated_length": 454.669677734375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.9987103430487491, + "grad_norm": 0.7366511225700378, + "kl": 0.0972900390625, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 111882340.0, + "reward": 1.3093750476837158, + "reward_std": 0.0987308993935585, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30937498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.31073373556137085, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 385.7599792480469, + "completions/mean_terminated_length": 385.7599792480469, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.9997420686097498, + "grad_norm": 0.7283992171287537, + "kl": 0.1048583984375, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 112005959.0, + "reward": 1.3343751430511475, + "reward_std": 0.08949775248765945, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.3058757483959198, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 921.0, + "completions/max_terminated_length": 921.0, + "completions/mean_length": 431.3750305175781, + "completions/mean_terminated_length": 431.3750305175781, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 1.0010317255610008, + "grad_norm": 0.8628621101379395, + "kl": 0.1007080078125, + "learning_rate": 1e-06, + "loss": -0.0084, + "num_tokens": 112121459.0, + "reward": 1.2875001430511475, + "reward_std": 0.20198066532611847, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28749996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.29042527079582214, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 815.0, + "completions/max_terminated_length": 815.0, + "completions/mean_length": 410.6785888671875, + "completions/mean_terminated_length": 410.6785888671875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 1.0020634511220015, + "grad_norm": 0.7795261144638062, + "kl": 0.098876953125, + "learning_rate": 1e-06, + "loss": 0.0124, + "num_tokens": 112241060.0, + "reward": 1.3843750953674316, + "reward_std": 0.13455899059772491, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3843750059604645, + "rewards/curriculum_aware_reward_fn/std": 0.31501689553260803, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 373.27679443359375, + "completions/mean_terminated_length": 373.27679443359375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 1.0030951766830023, + "grad_norm": 0.7977780699729919, + "kl": 0.1087646484375, + "learning_rate": 1e-06, + "loss": -0.0185, + "num_tokens": 112337999.0, + "reward": 1.2875001430511475, + "reward_std": 0.19644302129745483, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2874999940395355, + "rewards/curriculum_aware_reward_fn/std": 0.3123941123485565, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1072.0, + "completions/max_terminated_length": 1072.0, + "completions/mean_length": 423.8482360839844, + "completions/mean_terminated_length": 423.8482360839844, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 1.0041269022440031, + "grad_norm": 0.7241541147232056, + "kl": 0.1025390625, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 112453216.0, + "reward": 1.3000000715255737, + "reward_std": 0.08505426347255707, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29999998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.29660236835479736, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1138.0, + "completions/max_terminated_length": 1138.0, + "completions/mean_length": 449.33929443359375, + "completions/mean_terminated_length": 449.33929443359375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 1.005158627805004, + "grad_norm": 0.6764619946479797, + "kl": 0.1170654296875, + "learning_rate": 1e-06, + "loss": -0.014, + "num_tokens": 112571680.0, + "reward": 1.3156250715255737, + "reward_std": 0.1194203794002533, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.32873159646987915, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 380.96429443359375, + "completions/mean_terminated_length": 380.96429443359375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 1.0061903533660046, + "grad_norm": 0.7338297963142395, + "kl": 0.1099853515625, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 112679644.0, + "reward": 1.4093750715255737, + "reward_std": 0.1461634635925293, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40937498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3147665560245514, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1149.0, + "completions/max_terminated_length": 1149.0, + "completions/mean_length": 400.46429443359375, + "completions/mean_terminated_length": 400.46429443359375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 1.0072220789270054, + "grad_norm": 0.7369305491447449, + "kl": 0.1041259765625, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 112792788.0, + "reward": 1.2906250953674316, + "reward_std": 0.13024835288524628, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3004149794578552, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 989.0, + "completions/max_terminated_length": 989.0, + "completions/mean_length": 425.5089416503906, + "completions/mean_terminated_length": 425.5089416503906, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 1.0082538044880063, + "grad_norm": 0.6680691838264465, + "kl": 0.0927734375, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 112908002.0, + "reward": 1.231250286102295, + "reward_std": 0.11469794809818268, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.23125000298023224, + "rewards/curriculum_aware_reward_fn/std": 0.31005123257637024, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 371.0446472167969, + "completions/mean_terminated_length": 371.0446472167969, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 1.009285530049007, + "grad_norm": 0.8868218064308167, + "kl": 0.1068115234375, + "learning_rate": 1e-06, + "loss": -0.0073, + "num_tokens": 113017130.0, + "reward": 1.4406250715255737, + "reward_std": 0.15701918303966522, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44062498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.29243701696395874, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1042.0, + "completions/max_terminated_length": 1042.0, + "completions/mean_length": 428.21429443359375, + "completions/mean_terminated_length": 428.21429443359375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 1.0103172556100077, + "grad_norm": 0.9351738095283508, + "kl": 0.096435546875, + "learning_rate": 1e-06, + "loss": 0.0108, + "num_tokens": 113132682.0, + "reward": 1.2781251668930054, + "reward_std": 0.1593979001045227, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.27812498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.2824268341064453, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 401.1696472167969, + "completions/mean_terminated_length": 401.1696472167969, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 1.0113489811710086, + "grad_norm": 0.6989129185676575, + "kl": 0.106201171875, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 113246479.0, + "reward": 1.1906250715255737, + "reward_std": 0.12484891712665558, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19062498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.2853424549102783, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 375.8839416503906, + "completions/mean_terminated_length": 375.8839416503906, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 1.0123807067320092, + "grad_norm": 1.0648372173309326, + "kl": 0.1033935546875, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 113351592.0, + "reward": 1.3910716772079468, + "reward_std": 0.2463470697402954, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4000000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.39266932010650635, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 395.5357360839844, + "completions/mean_terminated_length": 395.5357360839844, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 1.01341243229301, + "grad_norm": 0.8360050320625305, + "kl": 0.1005859375, + "learning_rate": 1e-06, + "loss": 0.0123, + "num_tokens": 113455371.0, + "reward": 1.3562501668930054, + "reward_std": 0.16068443655967712, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3185782730579376, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 398.9910888671875, + "completions/mean_terminated_length": 398.9910888671875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 1.0144441578540109, + "grad_norm": 0.8212693929672241, + "kl": 0.107177734375, + "learning_rate": 1e-06, + "loss": -0.0121, + "num_tokens": 113563100.0, + "reward": 1.2875001430511475, + "reward_std": 0.16679814457893372, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28749996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.2979282736778259, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1056.0, + "completions/max_terminated_length": 1056.0, + "completions/mean_length": 427.5982360839844, + "completions/mean_terminated_length": 427.5982360839844, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 1.0154758834150117, + "grad_norm": 0.7585737705230713, + "kl": 0.0975341796875, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 113681530.0, + "reward": 1.25, + "reward_std": 0.12833638489246368, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25, + "rewards/curriculum_aware_reward_fn/std": 0.31315022706985474, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1355.0, + "completions/max_terminated_length": 1355.0, + "completions/mean_length": 390.95538330078125, + "completions/mean_terminated_length": 390.95538330078125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 1.0165076089760123, + "grad_norm": 0.9035147428512573, + "kl": 0.100830078125, + "learning_rate": 1e-06, + "loss": 0.0315, + "num_tokens": 113783221.0, + "reward": 1.343750238418579, + "reward_std": 0.12016370892524719, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34375, + "rewards/curriculum_aware_reward_fn/std": 0.3080105185508728, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 403.9732360839844, + "completions/mean_terminated_length": 403.9732360839844, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 1.0175393345370132, + "grad_norm": 0.8435715436935425, + "kl": 0.0992431640625, + "learning_rate": 1e-06, + "loss": 0.0296, + "num_tokens": 113895435.0, + "reward": 1.2816966772079468, + "reward_std": 0.11472270637750626, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3182533085346222, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 405.1964416503906, + "completions/mean_terminated_length": 405.1964416503906, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 1.018571060098014, + "grad_norm": 0.8098063468933105, + "kl": 0.103759765625, + "learning_rate": 1e-06, + "loss": 0.0196, + "num_tokens": 114014128.0, + "reward": 1.2468751668930054, + "reward_std": 0.1147625669836998, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24687500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.2994951903820038, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 376.3482360839844, + "completions/mean_terminated_length": 376.3482360839844, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 1.0196027856590146, + "grad_norm": 0.7244241237640381, + "kl": 0.1068115234375, + "learning_rate": 1e-06, + "loss": 0.0154, + "num_tokens": 114123374.0, + "reward": 1.2906252145767212, + "reward_std": 0.10551070421934128, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3004149794578552, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 431.0357360839844, + "completions/mean_terminated_length": 431.0357360839844, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 1.0206345112200155, + "grad_norm": 0.8381537199020386, + "kl": 0.1102294921875, + "learning_rate": 1e-06, + "loss": -0.0137, + "num_tokens": 114237177.0, + "reward": 1.3343751430511475, + "reward_std": 0.11328289657831192, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.3094627261161804, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 435.4910888671875, + "completions/mean_terminated_length": 435.4910888671875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 1.0216662367810163, + "grad_norm": 0.8157406449317932, + "kl": 0.101318359375, + "learning_rate": 1e-06, + "loss": 0.0178, + "num_tokens": 114354958.0, + "reward": 1.2875001430511475, + "reward_std": 0.16996197402477264, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28749996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.3016097843647003, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 909.0, + "completions/max_terminated_length": 909.0, + "completions/mean_length": 390.4910888671875, + "completions/mean_terminated_length": 390.4910888671875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 1.022697962342017, + "grad_norm": 0.8390117287635803, + "kl": 0.0965576171875, + "learning_rate": 1e-06, + "loss": 0.017, + "num_tokens": 114459852.0, + "reward": 1.2781251668930054, + "reward_std": 0.1394372582435608, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.27812498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.31212589144706726, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 875.0, + "completions/max_terminated_length": 875.0, + "completions/mean_length": 416.14288330078125, + "completions/mean_terminated_length": 416.14288330078125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 1.0237296879030178, + "grad_norm": 1.0235214233398438, + "kl": 0.0997314453125, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 114565302.0, + "reward": 1.4406250715255737, + "reward_std": 0.20136210322380066, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44062498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.28478938341140747, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1049.0, + "completions/max_terminated_length": 1049.0, + "completions/mean_length": 459.7857360839844, + "completions/mean_terminated_length": 459.7857360839844, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 1.0247614134640186, + "grad_norm": 0.8590009808540344, + "kl": 0.10009765625, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 114687589.0, + "reward": 1.2875001430511475, + "reward_std": 0.1746867150068283, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28749996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.3228183686733246, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 406.4732360839844, + "completions/mean_terminated_length": 406.4732360839844, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 1.0257931390250195, + "grad_norm": 0.6487825512886047, + "kl": 0.0975341796875, + "learning_rate": 1e-06, + "loss": 0.0204, + "num_tokens": 114803793.0, + "reward": 1.3093751668930054, + "reward_std": 0.11713527143001556, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.33467283844947815, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 418.6875305175781, + "completions/mean_terminated_length": 418.6875305175781, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 1.02682486458602, + "grad_norm": 0.6680878400802612, + "kl": 0.096923828125, + "learning_rate": 1e-06, + "loss": 0.0145, + "num_tokens": 114917662.0, + "reward": 1.2125000953674316, + "reward_std": 0.10435570031404495, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21249999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.2947360873222351, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 440.0714416503906, + "completions/mean_terminated_length": 440.0714416503906, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 1.027856590147021, + "grad_norm": 0.8054993152618408, + "kl": 0.094970703125, + "learning_rate": 1e-06, + "loss": 0.0191, + "num_tokens": 115030440.0, + "reward": 1.3593751192092896, + "reward_std": 0.13213443756103516, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.359375, + "rewards/curriculum_aware_reward_fn/std": 0.2988364100456238, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1444.0, + "completions/max_terminated_length": 1444.0, + "completions/mean_length": 443.6250305175781, + "completions/mean_terminated_length": 443.6250305175781, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 1.0288883157080218, + "grad_norm": 0.734188437461853, + "kl": 0.0919189453125, + "learning_rate": 1e-06, + "loss": -0.0174, + "num_tokens": 115150728.0, + "reward": 1.2937501668930054, + "reward_std": 0.1578095704317093, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.31005120277404785, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 805.0, + "completions/max_terminated_length": 805.0, + "completions/mean_length": 461.1964416503906, + "completions/mean_terminated_length": 461.1964416503906, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 1.0299200412690224, + "grad_norm": 0.7558555603027344, + "kl": 0.0980224609375, + "learning_rate": 1e-06, + "loss": -0.0079, + "num_tokens": 115279081.0, + "reward": 1.3250000476837158, + "reward_std": 0.13827043771743774, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32499998807907104, + "rewards/curriculum_aware_reward_fn/std": 0.30704930424690247, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 406.14288330078125, + "completions/mean_terminated_length": 406.14288330078125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 1.0309517668300232, + "grad_norm": 0.8551933169364929, + "kl": 0.0989990234375, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 115391874.0, + "reward": 1.3593751192092896, + "reward_std": 0.13182500004768372, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.359375, + "rewards/curriculum_aware_reward_fn/std": 0.3025068938732147, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 403.70538330078125, + "completions/mean_terminated_length": 403.70538330078125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 1.031983492391024, + "grad_norm": 0.8113073706626892, + "kl": 0.1077880859375, + "learning_rate": 1e-06, + "loss": 0.0096, + "num_tokens": 115502399.0, + "reward": 1.3312500715255737, + "reward_std": 0.15864074230194092, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33124998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.32153382897377014, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1075.0, + "completions/max_terminated_length": 1075.0, + "completions/mean_length": 444.607177734375, + "completions/mean_terminated_length": 444.607177734375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 1.0330152179520247, + "grad_norm": 0.778337299823761, + "kl": 0.099609375, + "learning_rate": 1e-06, + "loss": 0.0138, + "num_tokens": 115610512.0, + "reward": 1.265625238418579, + "reward_std": 0.1536625176668167, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.265625, + "rewards/curriculum_aware_reward_fn/std": 0.2867204546928406, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 983.0, + "completions/max_terminated_length": 983.0, + "completions/mean_length": 426.5000305175781, + "completions/mean_terminated_length": 426.5000305175781, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 1.0340469435130255, + "grad_norm": 0.7531226873397827, + "kl": 0.1019287109375, + "learning_rate": 1e-06, + "loss": 0.0246, + "num_tokens": 115723513.0, + "reward": 1.390625238418579, + "reward_std": 0.13248291611671448, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.390625, + "rewards/curriculum_aware_reward_fn/std": 0.28478941321372986, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 914.0, + "completions/max_terminated_length": 914.0, + "completions/mean_length": 430.8214416503906, + "completions/mean_terminated_length": 430.8214416503906, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 1.0350786690740263, + "grad_norm": 0.8895756006240845, + "kl": 0.1007080078125, + "learning_rate": 1e-06, + "loss": 0.0084, + "num_tokens": 115835367.0, + "reward": 1.1968750953674316, + "reward_std": 0.16403761506080627, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.2925718128681183, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 978.0, + "completions/max_terminated_length": 978.0, + "completions/mean_length": 489.96429443359375, + "completions/mean_terminated_length": 489.96429443359375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 1.0361103946350272, + "grad_norm": 0.611074686050415, + "kl": 0.085693359375, + "learning_rate": 1e-06, + "loss": 0.0146, + "num_tokens": 115972710.0, + "reward": 1.1343750953674316, + "reward_std": 0.10016217827796936, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.13437499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.2061040699481964, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 413.2500305175781, + "completions/mean_terminated_length": 413.2500305175781, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 1.0371421201960278, + "grad_norm": 0.7905242443084717, + "kl": 0.0982666015625, + "learning_rate": 1e-06, + "loss": 0.0153, + "num_tokens": 116086103.0, + "reward": 1.2718751430511475, + "reward_std": 0.15575583279132843, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.27187496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.31060686707496643, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 424.6696472167969, + "completions/mean_terminated_length": 424.6696472167969, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 1.0381738457570286, + "grad_norm": 0.92307049036026, + "kl": 0.11328125, + "learning_rate": 1e-06, + "loss": 0.0073, + "num_tokens": 116206452.0, + "reward": 1.3062502145767212, + "reward_std": 0.19037002325057983, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3791363537311554, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 427.5714416503906, + "completions/mean_terminated_length": 427.5714416503906, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 1.0392055713180295, + "grad_norm": 0.6816158294677734, + "kl": 0.1048583984375, + "learning_rate": 1e-06, + "loss": 0.0107, + "num_tokens": 116319797.0, + "reward": 1.3375002145767212, + "reward_std": 0.1783868372440338, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.37268805503845215, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 421.8839416503906, + "completions/mean_terminated_length": 421.8839416503906, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 1.04023729687903, + "grad_norm": 0.7737622857093811, + "kl": 0.1016845703125, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 116438546.0, + "reward": 1.28125, + "reward_std": 0.14772114157676697, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.3180830180644989, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 402.9910888671875, + "completions/mean_terminated_length": 402.9910888671875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 1.041269022440031, + "grad_norm": 0.7403515577316284, + "kl": 0.1068115234375, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 116546933.0, + "reward": 1.2687500715255737, + "reward_std": 0.08907577395439148, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.26874998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.2895418107509613, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 407.7232360839844, + "completions/mean_terminated_length": 407.7232360839844, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 1.0423007480010318, + "grad_norm": 0.7900381684303284, + "kl": 0.1107177734375, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 116658636.0, + "reward": 1.3625000715255737, + "reward_std": 0.12463202327489853, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3078185021877289, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 845.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 424.1785888671875, + "completions/mean_terminated_length": 424.1785888671875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 1.0433324735620324, + "grad_norm": 0.6202005743980408, + "kl": 0.097412109375, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 116775815.0, + "reward": 1.3500001430511475, + "reward_std": 0.09745357185602188, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3499999940395355, + "rewards/curriculum_aware_reward_fn/std": 0.31515759229660034, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 428.9285888671875, + "completions/mean_terminated_length": 428.9285888671875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 1.0443641991230332, + "grad_norm": 0.9668126106262207, + "kl": 0.099365234375, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 116894856.0, + "reward": 1.25, + "reward_std": 0.1951034516096115, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25, + "rewards/curriculum_aware_reward_fn/std": 0.3060206472873688, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 959.0, + "completions/max_terminated_length": 959.0, + "completions/mean_length": 440.3839416503906, + "completions/mean_terminated_length": 440.3839416503906, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 1.045395924684034, + "grad_norm": 0.7224416732788086, + "kl": 0.0953369140625, + "learning_rate": 1e-06, + "loss": -0.0034, + "num_tokens": 116999829.0, + "reward": 1.390625238418579, + "reward_std": 0.13267162442207336, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.390625, + "rewards/curriculum_aware_reward_fn/std": 0.2729164659976959, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 927.0, + "completions/max_terminated_length": 927.0, + "completions/mean_length": 456.1964416503906, + "completions/mean_terminated_length": 456.1964416503906, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 1.046427650245035, + "grad_norm": 0.6741520166397095, + "kl": 0.0992431640625, + "learning_rate": 1e-06, + "loss": 0.0188, + "num_tokens": 117116037.0, + "reward": 1.2843750715255737, + "reward_std": 0.10286222398281097, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28437501192092896, + "rewards/curriculum_aware_reward_fn/std": 0.30997174978256226, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 427.83929443359375, + "completions/mean_terminated_length": 427.83929443359375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 1.0474593758060355, + "grad_norm": 0.8775485157966614, + "kl": 0.1038818359375, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 117224638.0, + "reward": 1.3625000715255737, + "reward_std": 0.19390401244163513, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.31839263439178467, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3500.0, + "completions/max_terminated_length": 3500.0, + "completions/mean_length": 471.83038330078125, + "completions/mean_terminated_length": 471.83038330078125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 1.0484911013670364, + "grad_norm": 0.8173244595527649, + "kl": 0.095458984375, + "learning_rate": 1e-06, + "loss": -0.0017, + "num_tokens": 117345691.0, + "reward": 1.3250001668930054, + "reward_std": 0.1681494414806366, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32500001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3345697820186615, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 839.0, + "completions/max_terminated_length": 839.0, + "completions/mean_length": 437.33038330078125, + "completions/mean_terminated_length": 437.33038330078125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 1.0495228269280372, + "grad_norm": 0.6973631381988525, + "kl": 0.0870361328125, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 117458253.0, + "reward": 1.3625000715255737, + "reward_std": 0.1287010908126831, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.31490740180015564, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 437.8035888671875, + "completions/mean_terminated_length": 437.8035888671875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 1.0505545524890378, + "grad_norm": 0.7553476691246033, + "kl": 0.1016845703125, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 117575376.0, + "reward": 1.3062500953674316, + "reward_std": 0.16072815656661987, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645, + "rewards/curriculum_aware_reward_fn/std": 0.31559503078460693, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1148.0, + "completions/max_terminated_length": 1148.0, + "completions/mean_length": 496.02679443359375, + "completions/mean_terminated_length": 496.02679443359375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 1.0515862780500387, + "grad_norm": 0.6043642163276672, + "kl": 0.08447265625, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 117692713.0, + "reward": 1.3406251668930054, + "reward_std": 0.1272299736738205, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34062501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.31326034665107727, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 772.0, + "completions/max_terminated_length": 772.0, + "completions/mean_length": 450.02679443359375, + "completions/mean_terminated_length": 450.02679443359375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 1.0526180036110395, + "grad_norm": 0.71767258644104, + "kl": 0.095703125, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 117816700.0, + "reward": 1.2468751668930054, + "reward_std": 0.08287324756383896, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24687497317790985, + "rewards/curriculum_aware_reward_fn/std": 0.30677640438079834, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1677.0, + "completions/max_terminated_length": 1677.0, + "completions/mean_length": 450.3125305175781, + "completions/mean_terminated_length": 450.3125305175781, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 1.0536497291720401, + "grad_norm": 0.7819437384605408, + "kl": 0.1041259765625, + "learning_rate": 1e-06, + "loss": -0.0219, + "num_tokens": 117928721.0, + "reward": 1.3156250715255737, + "reward_std": 0.15827159583568573, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3114938735961914, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 836.0, + "completions/max_terminated_length": 836.0, + "completions/mean_length": 427.4910888671875, + "completions/mean_terminated_length": 427.4910888671875, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 1.054681454733041, + "grad_norm": 0.8454352617263794, + "kl": 0.094970703125, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 118039684.0, + "reward": 1.3250001668930054, + "reward_std": 0.14444464445114136, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32499998807907104, + "rewards/curriculum_aware_reward_fn/std": 0.3141555190086365, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 440.70538330078125, + "completions/mean_terminated_length": 440.70538330078125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 1.0557131802940418, + "grad_norm": 0.764153242111206, + "kl": 0.087646484375, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 118149803.0, + "reward": 1.4250000715255737, + "reward_std": 0.15279839932918549, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42499998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.31315022706985474, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 488.0357360839844, + "completions/mean_terminated_length": 488.0357360839844, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 1.0567449058550427, + "grad_norm": 0.7056334018707275, + "kl": 0.0885009765625, + "learning_rate": 1e-06, + "loss": 0.0292, + "num_tokens": 118285908.0, + "reward": 1.2906252145767212, + "reward_std": 0.14304517209529877, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.2853424549102783, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 988.0, + "completions/max_terminated_length": 988.0, + "completions/mean_length": 430.5714416503906, + "completions/mean_terminated_length": 430.5714416503906, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 1.0577766314160433, + "grad_norm": 0.797414243221283, + "kl": 0.096923828125, + "learning_rate": 1e-06, + "loss": 0.0307, + "num_tokens": 118399351.0, + "reward": 1.3875001668930054, + "reward_std": 0.16624128818511963, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38749998807907104, + "rewards/curriculum_aware_reward_fn/std": 0.30935123562812805, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 426.5982360839844, + "completions/mean_terminated_length": 426.5982360839844, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 1.0588083569770441, + "grad_norm": 0.750198483467102, + "kl": 0.0948486328125, + "learning_rate": 1e-06, + "loss": 0.0085, + "num_tokens": 118510515.0, + "reward": 1.3562500476837158, + "reward_std": 0.14397019147872925, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3080105185508728, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 463.20538330078125, + "completions/mean_terminated_length": 463.20538330078125, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 1.059840082538045, + "grad_norm": 0.7545668482780457, + "kl": 0.09130859375, + "learning_rate": 1e-06, + "loss": 0.0045, + "num_tokens": 118621383.0, + "reward": 1.28125, + "reward_std": 0.14572392404079437, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28125, + "rewards/curriculum_aware_reward_fn/std": 0.3038880527019501, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 898.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 391.6875305175781, + "completions/mean_terminated_length": 391.6875305175781, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 1.0608718080990456, + "grad_norm": 0.717291533946991, + "kl": 0.0943603515625, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 118734657.0, + "reward": 1.3031251430511475, + "reward_std": 0.13526920974254608, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3031249940395355, + "rewards/curriculum_aware_reward_fn/std": 0.2914920449256897, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 413.7321472167969, + "completions/mean_terminated_length": 413.7321472167969, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 1.0619035336600464, + "grad_norm": 0.7581539750099182, + "kl": 0.0992431640625, + "learning_rate": 1e-06, + "loss": 0.0194, + "num_tokens": 118846298.0, + "reward": 1.3906251192092896, + "reward_std": 0.13097161054611206, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.390625, + "rewards/curriculum_aware_reward_fn/std": 0.3280114233493805, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1135.0, + "completions/max_terminated_length": 1135.0, + "completions/mean_length": 493.58038330078125, + "completions/mean_terminated_length": 493.58038330078125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 1.0629352592210473, + "grad_norm": 0.6710352897644043, + "kl": 0.0892333984375, + "learning_rate": 1e-06, + "loss": -0.0075, + "num_tokens": 118974442.0, + "reward": 1.2312501668930054, + "reward_std": 0.13597580790519714, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.23124997317790985, + "rewards/curriculum_aware_reward_fn/std": 0.3028486967086792, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 426.7857360839844, + "completions/mean_terminated_length": 426.7857360839844, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 1.0639669847820479, + "grad_norm": 0.6972105503082275, + "kl": 0.0975341796875, + "learning_rate": 1e-06, + "loss": -0.0191, + "num_tokens": 119095896.0, + "reward": 1.3250001668930054, + "reward_std": 0.09518764913082123, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32499998807907104, + "rewards/curriculum_aware_reward_fn/std": 0.29231905937194824, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1136.0, + "completions/max_terminated_length": 1136.0, + "completions/mean_length": 487.7589416503906, + "completions/mean_terminated_length": 487.7589416503906, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 1.0649987103430487, + "grad_norm": 0.8523579835891724, + "kl": 0.08984375, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 119217277.0, + "reward": 1.234375, + "reward_std": 0.16692785918712616, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2343749850988388, + "rewards/curriculum_aware_reward_fn/std": 0.2714684009552002, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 411.0625305175781, + "completions/mean_terminated_length": 411.0625305175781, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 1.0660304359040496, + "grad_norm": 0.7746363282203674, + "kl": 0.0946044921875, + "learning_rate": 1e-06, + "loss": -0.0123, + "num_tokens": 119329711.0, + "reward": 1.347321629524231, + "reward_std": 0.17235055565834045, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.31509506702423096, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1049.0, + "completions/max_terminated_length": 1049.0, + "completions/mean_length": 443.4464416503906, + "completions/mean_terminated_length": 443.4464416503906, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 1.0670621614650504, + "grad_norm": 0.7955756783485413, + "kl": 0.0926513671875, + "learning_rate": 1e-06, + "loss": -0.0177, + "num_tokens": 119444519.0, + "reward": 1.3562501668930054, + "reward_std": 0.16780826449394226, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.32202377915382385, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 917.0, + "completions/max_terminated_length": 917.0, + "completions/mean_length": 464.6160888671875, + "completions/mean_terminated_length": 464.6160888671875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 1.068093887026051, + "grad_norm": 0.7193375825881958, + "kl": 0.091796875, + "learning_rate": 1e-06, + "loss": 0.0262, + "num_tokens": 119563924.0, + "reward": 1.2843750715255737, + "reward_std": 0.13995222747325897, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28437498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.29538729786872864, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1595.0, + "completions/max_terminated_length": 1595.0, + "completions/mean_length": 491.5625305175781, + "completions/mean_terminated_length": 491.5625305175781, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 1.0691256125870519, + "grad_norm": 0.6852903366088867, + "kl": 0.084228515625, + "learning_rate": 1e-06, + "loss": 0.0172, + "num_tokens": 119688482.0, + "reward": 1.3062500953674316, + "reward_std": 0.11520179361104965, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645, + "rewards/curriculum_aware_reward_fn/std": 0.2862561047077179, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 966.0, + "completions/max_terminated_length": 966.0, + "completions/mean_length": 468.9464416503906, + "completions/mean_terminated_length": 468.9464416503906, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 1.0701573381480527, + "grad_norm": 0.6984896063804626, + "kl": 0.0897216796875, + "learning_rate": 1e-06, + "loss": 0.0114, + "num_tokens": 119805411.0, + "reward": 1.3125, + "reward_std": 0.17140275239944458, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3125, + "rewards/curriculum_aware_reward_fn/std": 0.3267020583152771, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 980.0, + "completions/max_terminated_length": 980.0, + "completions/mean_length": 458.4285888671875, + "completions/mean_terminated_length": 458.4285888671875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 1.0711890637090533, + "grad_norm": 0.7449373006820679, + "kl": 0.090576171875, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 119919573.0, + "reward": 1.4468752145767212, + "reward_std": 0.17173954844474792, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4468750059604645, + "rewards/curriculum_aware_reward_fn/std": 0.36756327748298645, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 871.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 480.46429443359375, + "completions/mean_terminated_length": 480.46429443359375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 1.0722207892700542, + "grad_norm": 0.6695937514305115, + "kl": 0.0924072265625, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 120043756.0, + "reward": 1.3093751668930054, + "reward_std": 0.13723015785217285, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3071615993976593, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 981.0, + "completions/max_terminated_length": 981.0, + "completions/mean_length": 493.4285888671875, + "completions/mean_terminated_length": 493.4285888671875, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 1.073252514831055, + "grad_norm": 0.6117882132530212, + "kl": 0.1026611328125, + "learning_rate": 1e-06, + "loss": 0.0164, + "num_tokens": 120172303.0, + "reward": 1.1937501430511475, + "reward_std": 0.1261538565158844, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19375000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.28515249490737915, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 458.9107360839844, + "completions/mean_terminated_length": 458.9107360839844, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 1.0742842403920556, + "grad_norm": 0.619491696357727, + "kl": 0.0947265625, + "learning_rate": 1e-06, + "loss": -0.0123, + "num_tokens": 120282844.0, + "reward": 1.3156250715255737, + "reward_std": 0.11784723401069641, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3386533558368683, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1079.0, + "completions/max_terminated_length": 1079.0, + "completions/mean_length": 520.6964721679688, + "completions/mean_terminated_length": 520.6964721679688, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 1.0753159659530565, + "grad_norm": 0.47998204827308655, + "kl": 0.0845947265625, + "learning_rate": 1e-06, + "loss": 0.009, + "num_tokens": 120414977.0, + "reward": 1.1937501430511475, + "reward_std": 0.08792105317115784, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19375000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.2773040235042572, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 462.5089416503906, + "completions/mean_terminated_length": 462.5089416503906, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 1.0763476915140573, + "grad_norm": 62.07821273803711, + "kl": 10.761962890625, + "learning_rate": 1e-06, + "loss": 0.1105, + "num_tokens": 120526782.0, + "reward": 1.303125023841858, + "reward_std": 0.11684229224920273, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30312496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.31338611245155334, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 940.0, + "completions/max_terminated_length": 940.0, + "completions/mean_length": 488.7500305175781, + "completions/mean_terminated_length": 488.7500305175781, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 1.0773794170750581, + "grad_norm": 0.7104902267456055, + "kl": 0.0926513671875, + "learning_rate": 1e-06, + "loss": 0.0143, + "num_tokens": 120650962.0, + "reward": 1.2750000953674316, + "reward_std": 0.16270779073238373, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.2987210154533386, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1014.0, + "completions/max_terminated_length": 1014.0, + "completions/mean_length": 468.4464416503906, + "completions/mean_terminated_length": 468.4464416503906, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 1.0784111426360588, + "grad_norm": 0.7016794681549072, + "kl": 0.0914306640625, + "learning_rate": 1e-06, + "loss": 0.0162, + "num_tokens": 120775003.0, + "reward": 1.3812501430511475, + "reward_std": 0.12865351140499115, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3812499940395355, + "rewards/curriculum_aware_reward_fn/std": 0.27616459131240845, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 445.982177734375, + "completions/mean_terminated_length": 445.982177734375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 1.0794428681970596, + "grad_norm": 0.8315629363059998, + "kl": 0.104736328125, + "learning_rate": 1e-06, + "loss": -0.0075, + "num_tokens": 120900882.0, + "reward": 1.3031251430511475, + "reward_std": 0.15639452636241913, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30312496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.31688809394836426, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 458.26788330078125, + "completions/mean_terminated_length": 458.26788330078125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 1.0804745937580604, + "grad_norm": 0.672211766242981, + "kl": 0.09765625, + "learning_rate": 1e-06, + "loss": 0.0067, + "num_tokens": 121022026.0, + "reward": 1.3250001668930054, + "reward_std": 0.12497055530548096, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32499998807907104, + "rewards/curriculum_aware_reward_fn/std": 0.288519024848938, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 882.0, + "completions/max_terminated_length": 882.0, + "completions/mean_length": 449.58929443359375, + "completions/mean_terminated_length": 449.58929443359375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 1.081506319319061, + "grad_norm": 0.7990990877151489, + "kl": 0.092529296875, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 121135310.0, + "reward": 1.3531252145767212, + "reward_std": 0.14558075368404388, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.30263715982437134, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 938.0, + "completions/max_terminated_length": 938.0, + "completions/mean_length": 457.27679443359375, + "completions/mean_terminated_length": 457.27679443359375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 1.082538044880062, + "grad_norm": 0.8014538288116455, + "kl": 0.0947265625, + "learning_rate": 1e-06, + "loss": -0.0049, + "num_tokens": 121256840.0, + "reward": 1.3500001430511475, + "reward_std": 0.1600000113248825, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34999996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.2933957576751709, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 438.5625305175781, + "completions/mean_terminated_length": 438.5625305175781, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 1.0835697704410627, + "grad_norm": 0.8739475607872009, + "kl": 0.1004638671875, + "learning_rate": 1e-06, + "loss": 0.0138, + "num_tokens": 121374825.0, + "reward": 1.4937502145767212, + "reward_std": 0.18104833364486694, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4937499463558197, + "rewards/curriculum_aware_reward_fn/std": 0.27616459131240845, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 942.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 457.0982360839844, + "completions/mean_terminated_length": 457.0982360839844, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 1.0846014960020633, + "grad_norm": 0.7015742659568787, + "kl": 0.101806640625, + "learning_rate": 1e-06, + "loss": 0.0177, + "num_tokens": 121490084.0, + "reward": 1.4437501430511475, + "reward_std": 0.13059858977794647, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44374996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.31509506702423096, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1236.0, + "completions/max_terminated_length": 1236.0, + "completions/mean_length": 550.5535888671875, + "completions/mean_terminated_length": 550.5535888671875, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 1.0856332215630642, + "grad_norm": 0.6628881692886353, + "kl": 0.0919189453125, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 121630804.0, + "reward": 1.262946605682373, + "reward_std": 0.11178959906101227, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.2718749940395355, + "rewards/curriculum_aware_reward_fn/std": 0.31413981318473816, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1330.0, + "completions/max_terminated_length": 1330.0, + "completions/mean_length": 466.1160888671875, + "completions/mean_terminated_length": 466.1160888671875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 1.086664947124065, + "grad_norm": 0.8220956921577454, + "kl": 0.10546875, + "learning_rate": 1e-06, + "loss": 0.0284, + "num_tokens": 121751539.0, + "reward": 1.2598215341567993, + "reward_std": 0.2078828513622284, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.26874998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.30440643429756165, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1585.0, + "completions/max_terminated_length": 1585.0, + "completions/mean_length": 473.20538330078125, + "completions/mean_terminated_length": 473.20538330078125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 1.0876966726850659, + "grad_norm": 0.8058013319969177, + "kl": 0.100830078125, + "learning_rate": 1e-06, + "loss": 0.016, + "num_tokens": 121871437.0, + "reward": 1.3035714626312256, + "reward_std": 0.18695953488349915, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3125, + "rewards/curriculum_aware_reward_fn/std": 0.28714969754219055, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 423.8214416503906, + "completions/mean_terminated_length": 423.8214416503906, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 1.0887283982460665, + "grad_norm": 0.7489458322525024, + "kl": 0.1041259765625, + "learning_rate": 1e-06, + "loss": -0.0109, + "num_tokens": 121995744.0, + "reward": 1.4004465341567993, + "reward_std": 0.16618023812770844, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.40937498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3004149794578552, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 549.3928833007812, + "completions/mean_terminated_length": 549.3928833007812, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 1.0897601238070673, + "grad_norm": 0.5715673565864563, + "kl": 0.090087890625, + "learning_rate": 1e-06, + "loss": 0.0175, + "num_tokens": 122131665.0, + "reward": 1.2281250953674316, + "reward_std": 0.09870258718729019, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22812499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.27678829431533813, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 941.0, + "completions/max_terminated_length": 941.0, + "completions/mean_length": 477.794677734375, + "completions/mean_terminated_length": 477.794677734375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 1.0907918493680682, + "grad_norm": 0.6655464172363281, + "kl": 0.0966796875, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 122243201.0, + "reward": 1.2517858743667603, + "reward_std": 0.17265519499778748, + "rewards/code_format_reward/mean": 0.9642857313156128, + "rewards/code_format_reward/std": 0.18641091883182526, + "rewards/curriculum_aware_reward_fn/mean": 0.28749996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.319381445646286, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1749.0, + "completions/max_terminated_length": 1749.0, + "completions/mean_length": 462.4375305175781, + "completions/mean_terminated_length": 462.4375305175781, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 1.0918235749290688, + "grad_norm": 0.7837157845497131, + "kl": 0.100341796875, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 122365995.0, + "reward": 1.3571429252624512, + "reward_std": 0.21329350769519806, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.375, + "rewards/curriculum_aware_reward_fn/std": 0.3034338057041168, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1285.0, + "completions/max_terminated_length": 1285.0, + "completions/mean_length": 478.46429443359375, + "completions/mean_terminated_length": 478.46429443359375, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 1.0928553004900696, + "grad_norm": 0.721355140209198, + "kl": 0.10205078125, + "learning_rate": 1e-06, + "loss": 0.0227, + "num_tokens": 122485243.0, + "reward": 1.3441966772079468, + "reward_std": 0.1735697239637375, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.31688809394836426, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1478.0, + "completions/max_terminated_length": 1478.0, + "completions/mean_length": 516.1785888671875, + "completions/mean_terminated_length": 516.1785888671875, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 1.0938870260510705, + "grad_norm": 0.7072334885597229, + "kl": 0.093505859375, + "learning_rate": 1e-06, + "loss": -0.0312, + "num_tokens": 122614861.0, + "reward": 1.21875, + "reward_std": 0.15838447213172913, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21875, + "rewards/curriculum_aware_reward_fn/std": 0.27444660663604736, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1273.0, + "completions/max_terminated_length": 1273.0, + "completions/mean_length": 475.2232360839844, + "completions/mean_terminated_length": 475.2232360839844, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 1.094918751612071, + "grad_norm": 0.713330864906311, + "kl": 0.1046142578125, + "learning_rate": 1e-06, + "loss": 0.0283, + "num_tokens": 122733942.0, + "reward": 1.3187501430511475, + "reward_std": 0.11566664278507233, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.31874996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.31005120277404785, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 960.0, + "completions/max_terminated_length": 960.0, + "completions/mean_length": 429.7589416503906, + "completions/mean_terminated_length": 429.7589416503906, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 1.095950477173072, + "grad_norm": 0.749390721321106, + "kl": 0.1058349609375, + "learning_rate": 1e-06, + "loss": -0.0053, + "num_tokens": 122845831.0, + "reward": 1.3468750715255737, + "reward_std": 0.15194597840309143, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.31688809394836426, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1335.0, + "completions/max_terminated_length": 1335.0, + "completions/mean_length": 467.26788330078125, + "completions/mean_terminated_length": 467.26788330078125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 1.0969822027340728, + "grad_norm": 0.8033453226089478, + "kl": 0.1021728515625, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 122960836.0, + "reward": 1.3250001668930054, + "reward_std": 0.17109976708889008, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32499998807907104, + "rewards/curriculum_aware_reward_fn/std": 0.2997746765613556, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1742.0, + "completions/max_terminated_length": 1742.0, + "completions/mean_length": 470.5089416503906, + "completions/mean_terminated_length": 470.5089416503906, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 1.0980139282950736, + "grad_norm": 0.6588408350944519, + "kl": 0.0997314453125, + "learning_rate": 1e-06, + "loss": -0.0152, + "num_tokens": 123087166.0, + "reward": 1.2250001430511475, + "reward_std": 0.12828277051448822, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22499999403953552, + "rewards/curriculum_aware_reward_fn/std": 0.26097530126571655, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 909.0, + "completions/max_terminated_length": 909.0, + "completions/mean_length": 419.01788330078125, + "completions/mean_terminated_length": 419.01788330078125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 1.0990456538560742, + "grad_norm": 0.5486484169960022, + "kl": 0.10107421875, + "learning_rate": 1e-06, + "loss": 0.0128, + "num_tokens": 123196354.0, + "reward": 1.2375000715255737, + "reward_std": 0.11234258115291595, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.23749998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3964652419090271, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 807.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 418.40179443359375, + "completions/mean_terminated_length": 418.40179443359375, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 1.100077379417075, + "grad_norm": 0.8537197709083557, + "kl": 0.1011962890625, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 123307951.0, + "reward": 1.3441966772079468, + "reward_std": 0.17055167257785797, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3062620460987091, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 426.20538330078125, + "completions/mean_terminated_length": 426.20538330078125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 1.101109104978076, + "grad_norm": 0.5966207981109619, + "kl": 0.09375, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 123415543.0, + "reward": 1.3531252145767212, + "reward_std": 0.10239209979772568, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3305251896381378, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 951.0, + "completions/max_terminated_length": 951.0, + "completions/mean_length": 473.4732360839844, + "completions/mean_terminated_length": 473.4732360839844, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 1.1021408305390765, + "grad_norm": 0.6321438550949097, + "kl": 0.095947265625, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 123542415.0, + "reward": 1.2906250953674316, + "reward_std": 0.15328530967235565, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3004149794578552, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 970.0, + "completions/max_terminated_length": 970.0, + "completions/mean_length": 418.8035888671875, + "completions/mean_terminated_length": 418.8035888671875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 1.1031725561000774, + "grad_norm": 0.7372820973396301, + "kl": 0.092529296875, + "learning_rate": 1e-06, + "loss": 0.0201, + "num_tokens": 123659993.0, + "reward": 1.3781250715255737, + "reward_std": 0.18130964040756226, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.33266472816467285, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1124.0, + "completions/max_terminated_length": 1124.0, + "completions/mean_length": 421.2500305175781, + "completions/mean_terminated_length": 421.2500305175781, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 1.1042042816610782, + "grad_norm": 0.7741332054138184, + "kl": 0.1090087890625, + "learning_rate": 1e-06, + "loss": 0.0223, + "num_tokens": 123777187.0, + "reward": 1.3035715818405151, + "reward_std": 0.1366921067237854, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3125, + "rewards/curriculum_aware_reward_fn/std": 0.3267020583152771, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 868.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 491.6607360839844, + "completions/mean_terminated_length": 491.6607360839844, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 1.105236007222079, + "grad_norm": 0.6807509064674377, + "kl": 0.095703125, + "learning_rate": 1e-06, + "loss": 0.0131, + "num_tokens": 123906740.0, + "reward": 1.3093750476837158, + "reward_std": 0.20825700461864471, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.34442347288131714, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 921.0, + "completions/max_terminated_length": 921.0, + "completions/mean_length": 449.08038330078125, + "completions/mean_terminated_length": 449.08038330078125, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 1.1062677327830797, + "grad_norm": 0.7984329462051392, + "kl": 0.0904541015625, + "learning_rate": 1e-06, + "loss": 0.0206, + "num_tokens": 124021262.0, + "reward": 1.2843750715255737, + "reward_std": 0.1465483158826828, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28437498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.29538729786872864, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 408.8125305175781, + "completions/mean_terminated_length": 408.8125305175781, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 1.1072994583440805, + "grad_norm": 0.6433510184288025, + "kl": 0.0985107421875, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 124138218.0, + "reward": 1.3406251668930054, + "reward_std": 0.10084687173366547, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34062498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.33040592074394226, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 449.169677734375, + "completions/mean_terminated_length": 449.169677734375, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 1.1083311839050813, + "grad_norm": 0.671330451965332, + "kl": 0.0992431640625, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 124246324.0, + "reward": 1.2062500715255737, + "reward_std": 0.11124001443386078, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.20624998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.291711688041687, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 951.0, + "completions/max_terminated_length": 951.0, + "completions/mean_length": 467.982177734375, + "completions/mean_terminated_length": 467.982177734375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 1.109362909466082, + "grad_norm": 0.731284499168396, + "kl": 0.091552734375, + "learning_rate": 1e-06, + "loss": -0.0051, + "num_tokens": 124362501.0, + "reward": 1.3937503099441528, + "reward_std": 0.1432369202375412, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39374998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.29759734869003296, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1376.0, + "completions/max_terminated_length": 1376.0, + "completions/mean_length": 439.544677734375, + "completions/mean_terminated_length": 439.544677734375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 1.1103946350270828, + "grad_norm": 0.7016761898994446, + "kl": 0.099365234375, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 124479735.0, + "reward": 1.325446605682373, + "reward_std": 0.12540650367736816, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.31998246908187866, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1158.0, + "completions/mean_length": 470.46429443359375, + "completions/mean_terminated_length": 437.80181884765625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 1.1114263605880836, + "grad_norm": 0.8439191579818726, + "kl": 0.105224609375, + "learning_rate": 1e-06, + "loss": 0.0468, + "num_tokens": 124603386.0, + "reward": 1.2410714626312256, + "reward_std": 0.19769737124443054, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.25, + "rewards/curriculum_aware_reward_fn/std": 0.2950034439563751, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1081.0, + "completions/max_terminated_length": 1081.0, + "completions/mean_length": 436.6696472167969, + "completions/mean_terminated_length": 436.6696472167969, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 1.1124580861490843, + "grad_norm": 0.7611454129219055, + "kl": 0.1112060546875, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 124723602.0, + "reward": 1.343750238418579, + "reward_std": 0.1612805426120758, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34375, + "rewards/curriculum_aware_reward_fn/std": 0.2857048511505127, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1553.0, + "completions/max_terminated_length": 1553.0, + "completions/mean_length": 460.9107360839844, + "completions/mean_terminated_length": 460.9107360839844, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 1.113489811710085, + "grad_norm": 0.7447132468223572, + "kl": 0.1043701171875, + "learning_rate": 1e-06, + "loss": 0.0229, + "num_tokens": 124836908.0, + "reward": 1.3468750715255737, + "reward_std": 0.14876073598861694, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3371369242668152, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1044.0, + "completions/max_terminated_length": 1044.0, + "completions/mean_length": 474.58038330078125, + "completions/mean_terminated_length": 474.58038330078125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 1.114521537271086, + "grad_norm": 0.6706196665763855, + "kl": 0.09521484375, + "learning_rate": 1e-06, + "loss": 0.0172, + "num_tokens": 124957415.0, + "reward": 1.1812500953674316, + "reward_std": 0.12245100736618042, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.18124999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.23482069373130798, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1071.0, + "completions/max_terminated_length": 1071.0, + "completions/mean_length": 454.8482360839844, + "completions/mean_terminated_length": 454.8482360839844, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 1.1155532628320866, + "grad_norm": 0.7702577114105225, + "kl": 0.10888671875, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 125075964.0, + "reward": 1.3062502145767212, + "reward_std": 0.14353716373443604, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30625003576278687, + "rewards/curriculum_aware_reward_fn/std": 0.29759737849235535, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1220.0, + "completions/max_terminated_length": 1220.0, + "completions/mean_length": 472.83929443359375, + "completions/mean_terminated_length": 472.83929443359375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 1.1165849883930874, + "grad_norm": 0.7469330430030823, + "kl": 0.1016845703125, + "learning_rate": 1e-06, + "loss": 0.039, + "num_tokens": 125196297.0, + "reward": 1.2718751430511475, + "reward_std": 0.15673062205314636, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2718749940395355, + "rewards/curriculum_aware_reward_fn/std": 0.2960537075996399, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 425.40179443359375, + "completions/mean_terminated_length": 425.40179443359375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 1.1176167139540882, + "grad_norm": 0.7289387583732605, + "kl": 0.104248046875, + "learning_rate": 1e-06, + "loss": 0.0135, + "num_tokens": 125308650.0, + "reward": 1.3781250715255737, + "reward_std": 0.16956450045108795, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3359658122062683, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1262.0, + "completions/max_terminated_length": 1262.0, + "completions/mean_length": 470.9285888671875, + "completions/mean_terminated_length": 470.9285888671875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 1.118648439515089, + "grad_norm": 0.7212175130844116, + "kl": 0.1165771484375, + "learning_rate": 1e-06, + "loss": 0.0327, + "num_tokens": 125434313.0, + "reward": 1.2691967487335205, + "reward_std": 0.16892673075199127, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.27812498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.32255885004997253, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1064.0, + "completions/max_terminated_length": 1064.0, + "completions/mean_length": 489.5089416503906, + "completions/mean_terminated_length": 489.5089416503906, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 1.1196801650760897, + "grad_norm": 0.7285465598106384, + "kl": 0.095458984375, + "learning_rate": 1e-06, + "loss": 0.0382, + "num_tokens": 125545943.0, + "reward": 1.2250001430511475, + "reward_std": 0.15566104650497437, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22500000894069672, + "rewards/curriculum_aware_reward_fn/std": 0.27737507224082947, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2961.0, + "completions/max_terminated_length": 2961.0, + "completions/mean_length": 472.9285888671875, + "completions/mean_terminated_length": 472.9285888671875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 1.1207118906370905, + "grad_norm": 0.8245320320129395, + "kl": 0.110595703125, + "learning_rate": 1e-06, + "loss": 0.0221, + "num_tokens": 125676408.0, + "reward": 1.2004464864730835, + "reward_std": 0.23227369785308838, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.20937499403953552, + "rewards/curriculum_aware_reward_fn/std": 0.28754404187202454, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1224.0, + "completions/max_terminated_length": 1224.0, + "completions/mean_length": 438.2410888671875, + "completions/mean_terminated_length": 438.2410888671875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 1.1217436161980914, + "grad_norm": 0.5570046305656433, + "kl": 0.1021728515625, + "learning_rate": 1e-06, + "loss": -0.0165, + "num_tokens": 125790745.0, + "reward": 1.3468750715255737, + "reward_std": 0.0733615979552269, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.4082086384296417, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1317.0, + "completions/max_terminated_length": 1317.0, + "completions/mean_length": 488.6250305175781, + "completions/mean_terminated_length": 488.6250305175781, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 1.122775341759092, + "grad_norm": 0.7493662238121033, + "kl": 0.102294921875, + "learning_rate": 1e-06, + "loss": 0.0164, + "num_tokens": 125925019.0, + "reward": 1.28125, + "reward_std": 0.14943061769008636, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.32153382897377014, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 466.40179443359375, + "completions/mean_terminated_length": 466.40179443359375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 1.1238070673200928, + "grad_norm": 0.8465061783790588, + "kl": 0.1046142578125, + "learning_rate": 1e-06, + "loss": 0.0194, + "num_tokens": 126049384.0, + "reward": 1.359375238418579, + "reward_std": 0.2215120792388916, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.359375, + "rewards/curriculum_aware_reward_fn/std": 0.316763699054718, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1055.0, + "completions/max_terminated_length": 1055.0, + "completions/mean_length": 463.5357360839844, + "completions/mean_terminated_length": 463.5357360839844, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 1.1248387928810937, + "grad_norm": 0.6302226781845093, + "kl": 0.0997314453125, + "learning_rate": 1e-06, + "loss": 0.0197, + "num_tokens": 126165609.0, + "reward": 1.2973216772079468, + "reward_std": 0.11076794564723969, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3120785355567932, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 399.1875305175781, + "completions/mean_terminated_length": 399.1875305175781, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 1.1258705184420945, + "grad_norm": 0.7528978586196899, + "kl": 0.11376953125, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 126273871.0, + "reward": 1.3937500715255737, + "reward_std": 0.12978744506835938, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39374998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.32251301407814026, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 976.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 437.52679443359375, + "completions/mean_terminated_length": 437.52679443359375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 1.1269022440030951, + "grad_norm": 0.81932133436203, + "kl": 0.1043701171875, + "learning_rate": 1e-06, + "loss": 0.008, + "num_tokens": 126389619.0, + "reward": 1.3781250715255737, + "reward_std": 0.15330146253108978, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.29391586780548096, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 879.0, + "completions/max_terminated_length": 879.0, + "completions/mean_length": 424.6875305175781, + "completions/mean_terminated_length": 424.6875305175781, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 1.127933969564096, + "grad_norm": 0.7560570240020752, + "kl": 0.108154296875, + "learning_rate": 1e-06, + "loss": 0.0196, + "num_tokens": 126497923.0, + "reward": 1.3000000715255737, + "reward_std": 0.1389334499835968, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29999998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.33173036575317383, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1163.0, + "completions/max_terminated_length": 1163.0, + "completions/mean_length": 490.2500305175781, + "completions/mean_terminated_length": 490.2500305175781, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 1.1289656951250968, + "grad_norm": 0.5553421378135681, + "kl": 0.0921630859375, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 126621102.0, + "reward": 1.2156251668930054, + "reward_std": 0.1150357574224472, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21562500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.29431790113449097, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 465.0000305175781, + "completions/mean_terminated_length": 465.0000305175781, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 1.1299974206860974, + "grad_norm": 0.5771997570991516, + "kl": 0.104248046875, + "learning_rate": 1e-06, + "loss": -0.0051, + "num_tokens": 126745716.0, + "reward": 1.3218752145767212, + "reward_std": 0.10599697381258011, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3218750059604645, + "rewards/curriculum_aware_reward_fn/std": 0.34567996859550476, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 448.8839416503906, + "completions/mean_terminated_length": 448.8839416503906, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 1.1310291462470983, + "grad_norm": 0.6750321388244629, + "kl": 0.112548828125, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 126864234.0, + "reward": 1.3625000715255737, + "reward_std": 0.12350915372371674, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3005625307559967, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 448.0982360839844, + "completions/mean_terminated_length": 448.0982360839844, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 1.1320608718080991, + "grad_norm": 0.7752856016159058, + "kl": 0.105712890625, + "learning_rate": 1e-06, + "loss": 0.0235, + "num_tokens": 126983042.0, + "reward": 1.2531250715255737, + "reward_std": 0.15456053614616394, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25312498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.30882522463798523, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1640.0, + "completions/max_terminated_length": 1640.0, + "completions/mean_length": 518.357177734375, + "completions/mean_terminated_length": 518.357177734375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 1.1330925973690997, + "grad_norm": 0.6677262187004089, + "kl": 0.0966796875, + "learning_rate": 1e-06, + "loss": -0.0171, + "num_tokens": 127120947.0, + "reward": 1.2125000953674316, + "reward_std": 0.16567465662956238, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21249999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.29845699667930603, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 746.0, + "completions/max_terminated_length": 746.0, + "completions/mean_length": 428.7232360839844, + "completions/mean_terminated_length": 428.7232360839844, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 1.1341243229301006, + "grad_norm": 0.8252878785133362, + "kl": 0.1007080078125, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 127239447.0, + "reward": 1.2843750715255737, + "reward_std": 0.15611320734024048, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28437498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.2991000711917877, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 451.3035888671875, + "completions/mean_terminated_length": 451.3035888671875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 1.1351560484911014, + "grad_norm": 0.7429160475730896, + "kl": 0.1077880859375, + "learning_rate": 1e-06, + "loss": 0.0243, + "num_tokens": 127356577.0, + "reward": 1.2875001430511475, + "reward_std": 0.1438807249069214, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2874999940395355, + "rewards/curriculum_aware_reward_fn/std": 0.3159071207046509, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 416.5535888671875, + "completions/mean_terminated_length": 416.5535888671875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 1.136187774052102, + "grad_norm": 1.1605981588363647, + "kl": 0.179931640625, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 127472601.0, + "reward": 1.3343751430511475, + "reward_std": 0.12120731920003891, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.32680758833885193, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 434.2589416503906, + "completions/mean_terminated_length": 434.2589416503906, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 1.1372194996131029, + "grad_norm": 0.8007761836051941, + "kl": 0.1007080078125, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 127584574.0, + "reward": 1.2375000715255737, + "reward_std": 0.12666957080364227, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.23749998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.28272324800491333, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1192.0, + "completions/max_terminated_length": 1192.0, + "completions/mean_length": 466.794677734375, + "completions/mean_terminated_length": 466.794677734375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 1.1382512251741037, + "grad_norm": 0.746719241142273, + "kl": 0.109619140625, + "learning_rate": 1e-06, + "loss": 0.0054, + "num_tokens": 127712301.0, + "reward": 1.3035715818405151, + "reward_std": 0.1359844207763672, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3125, + "rewards/curriculum_aware_reward_fn/std": 0.31987470388412476, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1111.0, + "completions/max_terminated_length": 1111.0, + "completions/mean_length": 440.1339416503906, + "completions/mean_terminated_length": 440.1339416503906, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 1.1392829507351045, + "grad_norm": 0.7280793786048889, + "kl": 0.105712890625, + "learning_rate": 1e-06, + "loss": 0.0081, + "num_tokens": 127819759.0, + "reward": 1.3468750715255737, + "reward_std": 0.1580439656972885, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3098445534706116, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1057.0, + "completions/max_terminated_length": 1057.0, + "completions/mean_length": 476.0535888671875, + "completions/mean_terminated_length": 476.0535888671875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 1.1403146762961052, + "grad_norm": 0.7410470247268677, + "kl": 0.0887451171875, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 127929350.0, + "reward": 1.2937501668930054, + "reward_std": 0.15814347565174103, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.32397621870040894, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1237.0, + "completions/max_terminated_length": 1237.0, + "completions/mean_length": 480.1785888671875, + "completions/mean_terminated_length": 480.1785888671875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 1.141346401857106, + "grad_norm": 0.7002902626991272, + "kl": 0.0968017578125, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 128052206.0, + "reward": 1.262500286102295, + "reward_std": 0.15251708030700684, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.26250001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.29527053236961365, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1026.0, + "completions/max_terminated_length": 1026.0, + "completions/mean_length": 426.76788330078125, + "completions/mean_terminated_length": 426.76788330078125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 1.1423781274181068, + "grad_norm": 0.8631582260131836, + "kl": 0.0955810546875, + "learning_rate": 1e-06, + "loss": 0.0142, + "num_tokens": 128162507.0, + "reward": 1.4031251668930054, + "reward_std": 0.19521509110927582, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40312498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.29791173338890076, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1116.0, + "completions/max_terminated_length": 1116.0, + "completions/mean_length": 456.669677734375, + "completions/mean_terminated_length": 456.669677734375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 1.1434098529791075, + "grad_norm": 0.6821380853652954, + "kl": 0.0904541015625, + "learning_rate": 1e-06, + "loss": -0.0024, + "num_tokens": 128281260.0, + "reward": 1.3531252145767212, + "reward_std": 0.1475609540939331, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3271692097187042, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 800.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 420.45538330078125, + "completions/mean_terminated_length": 420.45538330078125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 1.1444415785401083, + "grad_norm": 0.7330096960067749, + "kl": 0.10400390625, + "learning_rate": 1e-06, + "loss": -0.008, + "num_tokens": 128395376.0, + "reward": 1.4156252145767212, + "reward_std": 0.16615553200244904, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4156249463558197, + "rewards/curriculum_aware_reward_fn/std": 0.3204748034477234, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 961.0, + "completions/max_terminated_length": 961.0, + "completions/mean_length": 438.5446472167969, + "completions/mean_terminated_length": 438.5446472167969, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 1.1454733041011091, + "grad_norm": 0.6928600668907166, + "kl": 0.096923828125, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 128512547.0, + "reward": 1.3531252145767212, + "reward_std": 0.11022058129310608, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3062620460987091, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 862.0, + "completions/max_terminated_length": 862.0, + "completions/mean_length": 437.52679443359375, + "completions/mean_terminated_length": 437.52679443359375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 1.14650502966211, + "grad_norm": 0.6906600594520569, + "kl": 0.09228515625, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 128622716.0, + "reward": 1.3468750715255737, + "reward_std": 0.10169928520917892, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.29896828532218933, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1141.0, + "completions/max_terminated_length": 1141.0, + "completions/mean_length": 480.357177734375, + "completions/mean_terminated_length": 480.357177734375, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 1.1475367552231106, + "grad_norm": 0.8089150190353394, + "kl": 0.10302734375, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 128747160.0, + "reward": 1.2281250953674316, + "reward_std": 0.16455334424972534, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22812499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.27277201414108276, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1058.0, + "completions/max_terminated_length": 1058.0, + "completions/mean_length": 465.46429443359375, + "completions/mean_terminated_length": 465.46429443359375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 1.1485684807841114, + "grad_norm": 0.7459560632705688, + "kl": 0.09130859375, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 128868095.0, + "reward": 1.3218750953674316, + "reward_std": 0.15609592199325562, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3218749463558197, + "rewards/curriculum_aware_reward_fn/std": 0.32596227526664734, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 939.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 489.3035888671875, + "completions/mean_terminated_length": 489.3035888671875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 1.1496002063451123, + "grad_norm": 0.7976208329200745, + "kl": 0.09814453125, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 128986842.0, + "reward": 1.2781251668930054, + "reward_std": 0.1544256955385208, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.27812498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.2663382887840271, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1825.0, + "completions/max_terminated_length": 1825.0, + "completions/mean_length": 467.4285888671875, + "completions/mean_terminated_length": 467.4285888671875, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 1.150631931906113, + "grad_norm": 0.6780887246131897, + "kl": 0.0997314453125, + "learning_rate": 1e-06, + "loss": 0.0164, + "num_tokens": 129100472.0, + "reward": 1.296875, + "reward_std": 0.13180279731750488, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.296875, + "rewards/curriculum_aware_reward_fn/std": 0.3193660080432892, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 799.0, + "completions/max_terminated_length": 799.0, + "completions/mean_length": 450.52679443359375, + "completions/mean_terminated_length": 450.52679443359375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 1.1516636574671137, + "grad_norm": 0.8171302676200867, + "kl": 0.098876953125, + "learning_rate": 1e-06, + "loss": 0.0324, + "num_tokens": 129214050.0, + "reward": 1.3812501430511475, + "reward_std": 0.19573992490768433, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38124996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.3273649215698242, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 428.8482360839844, + "completions/mean_terminated_length": 428.8482360839844, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 1.1526953830281146, + "grad_norm": 0.8667630553245544, + "kl": 0.1043701171875, + "learning_rate": 1e-06, + "loss": 0.0108, + "num_tokens": 129325102.0, + "reward": 1.3531250953674316, + "reward_std": 0.1513831466436386, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3531250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.29525381326675415, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 421.7500305175781, + "completions/mean_terminated_length": 421.7500305175781, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 1.1537271085891152, + "grad_norm": 1.1657435894012451, + "kl": 0.181884765625, + "learning_rate": 1e-06, + "loss": -0.0083, + "num_tokens": 129433000.0, + "reward": 1.1968752145767212, + "reward_std": 0.11701443046331406, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.2648543119430542, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.0, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 428.83929443359375, + "completions/mean_terminated_length": 428.83929443359375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 1.154758834150116, + "grad_norm": 0.6912339329719543, + "kl": 0.1046142578125, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 129538017.0, + "reward": 1.303125023841858, + "reward_std": 0.1794067919254303, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30312496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.3305251896381378, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1067.0, + "completions/max_terminated_length": 1067.0, + "completions/mean_length": 479.6160888671875, + "completions/mean_terminated_length": 479.6160888671875, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 1.1557905597111169, + "grad_norm": 0.639657735824585, + "kl": 0.095458984375, + "learning_rate": 1e-06, + "loss": -0.0019, + "num_tokens": 129657219.0, + "reward": 1.265625, + "reward_std": 0.08700991421937943, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.265625, + "rewards/curriculum_aware_reward_fn/std": 0.30895283818244934, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 432.3750305175781, + "completions/mean_terminated_length": 432.3750305175781, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 1.1568222852721175, + "grad_norm": 0.8700076937675476, + "kl": 0.10205078125, + "learning_rate": 1e-06, + "loss": 0.0075, + "num_tokens": 129760644.0, + "reward": 1.372321605682373, + "reward_std": 0.23868943750858307, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3812499940395355, + "rewards/curriculum_aware_reward_fn/std": 0.3028486967086792, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1218.0, + "completions/max_terminated_length": 1218.0, + "completions/mean_length": 449.9732360839844, + "completions/mean_terminated_length": 449.9732360839844, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 1.1578540108331183, + "grad_norm": 0.634154200553894, + "kl": 0.091552734375, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 129867776.0, + "reward": 1.2879465818405151, + "reward_std": 0.14574995636940002, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.296875, + "rewards/curriculum_aware_reward_fn/std": 0.32956981658935547, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 455.40179443359375, + "completions/mean_terminated_length": 455.40179443359375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 1.1588857363941192, + "grad_norm": 0.7961022853851318, + "kl": 0.1005859375, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 129992561.0, + "reward": 1.278125286102295, + "reward_std": 0.17692962288856506, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.27812498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.3703407943248749, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 907.0, + "completions/max_terminated_length": 907.0, + "completions/mean_length": 455.89288330078125, + "completions/mean_terminated_length": 455.89288330078125, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 1.15991746195512, + "grad_norm": 0.5796977877616882, + "kl": 0.0980224609375, + "learning_rate": 1e-06, + "loss": 0.0423, + "num_tokens": 130104493.0, + "reward": 1.453125238418579, + "reward_std": 0.09935219585895538, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.453125, + "rewards/curriculum_aware_reward_fn/std": 0.30315765738487244, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 468.4732360839844, + "completions/mean_terminated_length": 468.4732360839844, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 1.1609491875161206, + "grad_norm": 0.7204039096832275, + "kl": 0.08984375, + "learning_rate": 1e-06, + "loss": -0.0093, + "num_tokens": 130221229.0, + "reward": 1.2281252145767212, + "reward_std": 0.12669439613819122, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22812499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.2807471752166748, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 444.0000305175781, + "completions/mean_terminated_length": 444.0000305175781, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 1.1619809130771215, + "grad_norm": 0.6680362224578857, + "kl": 0.0992431640625, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 130342861.0, + "reward": 1.3093751668930054, + "reward_std": 0.10159022361040115, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30937498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.38382449746131897, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1049.0, + "completions/max_terminated_length": 1049.0, + "completions/mean_length": 510.0982360839844, + "completions/mean_terminated_length": 510.0982360839844, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 1.1630126386381223, + "grad_norm": 0.5835331082344055, + "kl": 0.0870361328125, + "learning_rate": 1e-06, + "loss": 0.0252, + "num_tokens": 130476047.0, + "reward": 1.3125, + "reward_std": 0.1379421055316925, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3125, + "rewards/curriculum_aware_reward_fn/std": 0.3233064115047455, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 427.7232360839844, + "completions/mean_terminated_length": 427.7232360839844, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 1.164044364199123, + "grad_norm": 0.724087655544281, + "kl": 0.0877685546875, + "learning_rate": 1e-06, + "loss": -0.0036, + "num_tokens": 130585853.0, + "reward": 1.3406251668930054, + "reward_std": 0.11424807459115982, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34062498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.3025068938732147, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1391.0, + "completions/max_terminated_length": 1391.0, + "completions/mean_length": 467.46429443359375, + "completions/mean_terminated_length": 467.46429443359375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 1.1650760897601238, + "grad_norm": 0.6006419658660889, + "kl": 0.0948486328125, + "learning_rate": 1e-06, + "loss": -0.0135, + "num_tokens": 130704058.0, + "reward": 1.3656251430511475, + "reward_std": 0.11053341627120972, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36562496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.3165147304534912, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 903.0, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 446.83929443359375, + "completions/mean_terminated_length": 446.83929443359375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 1.1661078153211246, + "grad_norm": 0.7218096256256104, + "kl": 0.08984375, + "learning_rate": 1e-06, + "loss": 0.0193, + "num_tokens": 130818147.0, + "reward": 1.265625238418579, + "reward_std": 0.143123596906662, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.265625, + "rewards/curriculum_aware_reward_fn/std": 0.2867204546928406, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 875.0, + "completions/max_terminated_length": 875.0, + "completions/mean_length": 389.6250305175781, + "completions/mean_terminated_length": 389.6250305175781, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 1.1671395408821255, + "grad_norm": 0.5740315318107605, + "kl": 0.1015625, + "learning_rate": 1e-06, + "loss": -0.0139, + "num_tokens": 130919514.0, + "reward": 1.4406250715255737, + "reward_std": 0.12512782216072083, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44062498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3280114233493805, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 909.0, + "completions/max_terminated_length": 909.0, + "completions/mean_length": 477.8035888671875, + "completions/mean_terminated_length": 477.8035888671875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 1.168171266443126, + "grad_norm": 0.686428964138031, + "kl": 0.08837890625, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 131040466.0, + "reward": 1.3093751668930054, + "reward_std": 0.1683778315782547, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3570103049278259, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 898.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 465.3125305175781, + "completions/mean_terminated_length": 465.3125305175781, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 1.169202992004127, + "grad_norm": 0.6112284064292908, + "kl": 0.089599609375, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 131163757.0, + "reward": 1.3500001430511475, + "reward_std": 0.1343052089214325, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3499999940395355, + "rewards/curriculum_aware_reward_fn/std": 0.3080745041370392, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1139.0, + "completions/max_terminated_length": 1139.0, + "completions/mean_length": 451.64288330078125, + "completions/mean_terminated_length": 451.64288330078125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 1.1702347175651278, + "grad_norm": 0.605445384979248, + "kl": 0.0877685546875, + "learning_rate": 1e-06, + "loss": 0.0084, + "num_tokens": 131272287.0, + "reward": 1.3031251430511475, + "reward_std": 0.12129982560873032, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30312496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.31338611245155334, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 979.0, + "completions/max_terminated_length": 979.0, + "completions/mean_length": 442.544677734375, + "completions/mean_terminated_length": 442.544677734375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 1.1712664431261284, + "grad_norm": 0.6315911412239075, + "kl": 0.0882568359375, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 131384250.0, + "reward": 1.2875001430511475, + "reward_std": 0.15498583018779755, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2874999940395355, + "rewards/curriculum_aware_reward_fn/std": 0.3295847773551941, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1045.0, + "completions/max_terminated_length": 1045.0, + "completions/mean_length": 429.51788330078125, + "completions/mean_terminated_length": 429.51788330078125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 1.1722981686871292, + "grad_norm": 0.6539668440818787, + "kl": 0.0860595703125, + "learning_rate": 1e-06, + "loss": -0.0019, + "num_tokens": 131501355.0, + "reward": 1.3468750715255737, + "reward_std": 0.16448518633842468, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3237784504890442, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 424.40179443359375, + "completions/mean_terminated_length": 424.40179443359375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 1.17332989424813, + "grad_norm": 0.7477120757102966, + "kl": 0.0941162109375, + "learning_rate": 1e-06, + "loss": -0.0238, + "num_tokens": 131618440.0, + "reward": 1.3593751192092896, + "reward_std": 0.13173705339431763, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.359375, + "rewards/curriculum_aware_reward_fn/std": 0.32704871892929077, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 935.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 504.96429443359375, + "completions/mean_terminated_length": 504.96429443359375, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 1.1743616198091307, + "grad_norm": 0.670097827911377, + "kl": 0.08349609375, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 131740838.0, + "reward": 1.3191965818405151, + "reward_std": 0.15710267424583435, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.328125, + "rewards/curriculum_aware_reward_fn/std": 0.3090803623199463, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1451.0, + "completions/max_terminated_length": 1451.0, + "completions/mean_length": 507.232177734375, + "completions/mean_terminated_length": 507.232177734375, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 1.1753933453701315, + "grad_norm": 0.5765451192855835, + "kl": 0.0892333984375, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 131864582.0, + "reward": 1.3218752145767212, + "reward_std": 0.08672605454921722, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3218750059604645, + "rewards/curriculum_aware_reward_fn/std": 0.40724197030067444, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 449.5089416503906, + "completions/mean_terminated_length": 449.5089416503906, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 1.1764250709311324, + "grad_norm": 0.6205755472183228, + "kl": 0.0936279296875, + "learning_rate": 1e-06, + "loss": 0.012, + "num_tokens": 131986236.0, + "reward": 1.403125286102295, + "reward_std": 0.13763344287872314, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40312498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.29040831327438354, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 464.6160888671875, + "completions/mean_terminated_length": 464.6160888671875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 1.177456796492133, + "grad_norm": 0.794878363609314, + "kl": 0.0904541015625, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 132106362.0, + "reward": 1.3500001430511475, + "reward_std": 0.1489112824201584, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3499999940395355, + "rewards/curriculum_aware_reward_fn/std": 0.30082470178604126, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1085.0, + "completions/max_terminated_length": 1085.0, + "completions/mean_length": 484.5625305175781, + "completions/mean_terminated_length": 484.5625305175781, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 1.1784885220531338, + "grad_norm": 0.6890538334846497, + "kl": 0.0811767578125, + "learning_rate": 1e-06, + "loss": -0.0144, + "num_tokens": 132228820.0, + "reward": 1.3093751668930054, + "reward_std": 0.2188614159822464, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30937498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.31426528096199036, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 957.0, + "completions/max_terminated_length": 957.0, + "completions/mean_length": 487.107177734375, + "completions/mean_terminated_length": 487.107177734375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 1.1795202476141347, + "grad_norm": 0.6225954294204712, + "kl": 0.08984375, + "learning_rate": 1e-06, + "loss": 0.0153, + "num_tokens": 132355595.0, + "reward": 1.2562501430511475, + "reward_std": 0.14356856048107147, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25624996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.31157293915748596, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 924.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 508.8035888671875, + "completions/mean_terminated_length": 508.8035888671875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 1.1805519731751355, + "grad_norm": 0.6063207983970642, + "kl": 0.081298828125, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 132481216.0, + "reward": 1.3156250715255737, + "reward_std": 0.16792182624340057, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.30067723989486694, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 901.0, + "completions/max_terminated_length": 901.0, + "completions/mean_length": 506.58038330078125, + "completions/mean_terminated_length": 506.58038330078125, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 1.1815836987361361, + "grad_norm": 0.7089967727661133, + "kl": 0.086181640625, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 132613077.0, + "reward": 1.2750000953674316, + "reward_std": 0.14475411176681519, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.2950034439563751, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 866.0, + "completions/max_terminated_length": 866.0, + "completions/mean_length": 506.5625305175781, + "completions/mean_terminated_length": 506.5625305175781, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 1.182615424297137, + "grad_norm": 0.7573201060295105, + "kl": 0.088134765625, + "learning_rate": 1e-06, + "loss": 0.0024, + "num_tokens": 132742346.0, + "reward": 1.2906250953674316, + "reward_std": 0.20178672671318054, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.29297566413879395, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 440.8660888671875, + "completions/mean_terminated_length": 440.8660888671875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 1.1836471498581378, + "grad_norm": 0.7480193972587585, + "kl": 0.093017578125, + "learning_rate": 1e-06, + "loss": 0.0139, + "num_tokens": 132852147.0, + "reward": 1.3343751430511475, + "reward_std": 0.20899318158626556, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.32680758833885193, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1013.0, + "completions/max_terminated_length": 1013.0, + "completions/mean_length": 496.732177734375, + "completions/mean_terminated_length": 496.732177734375, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 1.1846788754191384, + "grad_norm": 0.6407890915870667, + "kl": 0.0936279296875, + "learning_rate": 1e-06, + "loss": 0.0205, + "num_tokens": 132985838.0, + "reward": 1.2375000715255737, + "reward_std": 0.17742672562599182, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.23749999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.27879244089126587, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 999.0, + "completions/max_terminated_length": 999.0, + "completions/mean_length": 507.39288330078125, + "completions/mean_terminated_length": 507.39288330078125, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 1.1857106009801393, + "grad_norm": 0.5715434551239014, + "kl": 0.0921630859375, + "learning_rate": 1e-06, + "loss": 0.0081, + "num_tokens": 133107850.0, + "reward": 1.2723214626312256, + "reward_std": 0.1283775120973587, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.2812499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.2965359389781952, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 489.51788330078125, + "completions/mean_terminated_length": 489.51788330078125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 1.18674232654114, + "grad_norm": 0.674042284488678, + "kl": 0.1044921875, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 133230660.0, + "reward": 1.2312501668930054, + "reward_std": 0.18054217100143433, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.23124997317790985, + "rewards/curriculum_aware_reward_fn/std": 0.31005120277404785, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 963.0, + "completions/max_terminated_length": 963.0, + "completions/mean_length": 504.2857360839844, + "completions/mean_terminated_length": 504.2857360839844, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 1.187774052102141, + "grad_norm": 0.6435889601707458, + "kl": 0.1002197265625, + "learning_rate": 1e-06, + "loss": -0.0073, + "num_tokens": 133355964.0, + "reward": 1.2593750953674316, + "reward_std": 0.16222849488258362, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2593750059604645, + "rewards/curriculum_aware_reward_fn/std": 0.28863856196403503, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 517.8035888671875, + "completions/mean_terminated_length": 517.8035888671875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 1.1888057776631415, + "grad_norm": 0.630872905254364, + "kl": 0.093017578125, + "learning_rate": 1e-06, + "loss": 0.0268, + "num_tokens": 133491259.0, + "reward": 1.2750002145767212, + "reward_std": 0.16553469002246857, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2750000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.29872098565101624, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 931.0, + "completions/max_terminated_length": 931.0, + "completions/mean_length": 518.6785888671875, + "completions/mean_terminated_length": 518.6785888671875, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 1.1898375032241424, + "grad_norm": 0.7435547709465027, + "kl": 0.08837890625, + "learning_rate": 1e-06, + "loss": 0.0086, + "num_tokens": 133623901.0, + "reward": 1.28125, + "reward_std": 0.20633579790592194, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28125, + "rewards/curriculum_aware_reward_fn/std": 0.3038880527019501, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1219.0, + "completions/max_terminated_length": 1219.0, + "completions/mean_length": 507.5357360839844, + "completions/mean_terminated_length": 507.5357360839844, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 1.1908692287851432, + "grad_norm": 0.5446999073028564, + "kl": 0.0982666015625, + "learning_rate": 1e-06, + "loss": 0.0032, + "num_tokens": 133742863.0, + "reward": 1.3687502145767212, + "reward_std": 0.10711290687322617, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3687500059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3249480128288269, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 473.794677734375, + "completions/mean_terminated_length": 473.794677734375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 1.1919009543461438, + "grad_norm": 0.7982268929481506, + "kl": 0.09130859375, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 133861700.0, + "reward": 1.3062500953674316, + "reward_std": 0.23682567477226257, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3012829124927521, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1189.0, + "completions/max_terminated_length": 1189.0, + "completions/mean_length": 514.1160888671875, + "completions/mean_terminated_length": 514.1160888671875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 1.1929326799071447, + "grad_norm": 0.7393704056739807, + "kl": 0.0997314453125, + "learning_rate": 1e-06, + "loss": 0.0099, + "num_tokens": 133983289.0, + "reward": 1.3062500953674316, + "reward_std": 0.2049076408147812, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3062500059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3049239218235016, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 987.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 544.544677734375, + "completions/mean_terminated_length": 544.544677734375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 1.1939644054681455, + "grad_norm": 0.7494214773178101, + "kl": 0.0885009765625, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 134107782.0, + "reward": 1.2125000953674316, + "reward_std": 0.1954411119222641, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21249999105930328, + "rewards/curriculum_aware_reward_fn/std": 0.2672431766986847, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1025.0, + "completions/max_terminated_length": 1025.0, + "completions/mean_length": 550.5089721679688, + "completions/mean_terminated_length": 550.5089721679688, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 1.1949961310291464, + "grad_norm": 0.6580347418785095, + "kl": 0.0845947265625, + "learning_rate": 1e-06, + "loss": 0.0203, + "num_tokens": 134237185.0, + "reward": 1.2312501668930054, + "reward_std": 0.18060758709907532, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.23124997317790985, + "rewards/curriculum_aware_reward_fn/std": 0.2840445339679718, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1190.0, + "completions/max_terminated_length": 1190.0, + "completions/mean_length": 512.4464721679688, + "completions/mean_terminated_length": 512.4464721679688, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 1.196027856590147, + "grad_norm": 0.6041538715362549, + "kl": 0.091552734375, + "learning_rate": 1e-06, + "loss": 0.0187, + "num_tokens": 134362367.0, + "reward": 1.3562500476837158, + "reward_std": 0.15172560513019562, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35624998807907104, + "rewards/curriculum_aware_reward_fn/std": 0.31157293915748596, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1382.0, + "completions/max_terminated_length": 1382.0, + "completions/mean_length": 489.9285888671875, + "completions/mean_terminated_length": 489.9285888671875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 1.1970595821511478, + "grad_norm": 0.6599896550178528, + "kl": 0.107421875, + "learning_rate": 1e-06, + "loss": 0.0024, + "num_tokens": 134485258.0, + "reward": 1.3625000715255737, + "reward_std": 0.165399968624115, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.33196792006492615, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1643.0, + "completions/max_terminated_length": 1643.0, + "completions/mean_length": 527.1160888671875, + "completions/mean_terminated_length": 527.1160888671875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 1.1980913077121484, + "grad_norm": 0.6196821928024292, + "kl": 0.08740234375, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 134609682.0, + "reward": 1.3250001668930054, + "reward_std": 0.13873064517974854, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32500001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3279062509536743, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 907.0, + "completions/max_terminated_length": 907.0, + "completions/mean_length": 443.4375305175781, + "completions/mean_terminated_length": 443.4375305175781, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 1.1991230332731493, + "grad_norm": 0.7619950175285339, + "kl": 0.103515625, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 134717580.0, + "reward": 1.328125, + "reward_std": 0.16038528084754944, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.328125, + "rewards/curriculum_aware_reward_fn/std": 0.3230472207069397, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1017.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 505.83038330078125, + "completions/mean_terminated_length": 505.83038330078125, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 1.2001547588341501, + "grad_norm": 0.6503728628158569, + "kl": 0.1016845703125, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 134836596.0, + "reward": 1.3031251430511475, + "reward_std": 0.15563803911209106, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30312496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.3062620460987091, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 976.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 492.02679443359375, + "completions/mean_terminated_length": 492.02679443359375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 1.201186484395151, + "grad_norm": 0.5857566595077515, + "kl": 0.08642578125, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 134955641.0, + "reward": 1.441071629524231, + "reward_std": 0.185373455286026, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44107145071029663, + "rewards/curriculum_aware_reward_fn/std": 0.44210872054100037, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1654.0, + "completions/max_terminated_length": 1654.0, + "completions/mean_length": 498.5714416503906, + "completions/mean_terminated_length": 498.5714416503906, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 1.2022182099561516, + "grad_norm": 0.6004607677459717, + "kl": 0.0880126953125, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 135080144.0, + "reward": 1.3633930683135986, + "reward_std": 0.11078914254903793, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3633928596973419, + "rewards/curriculum_aware_reward_fn/std": 0.4917326867580414, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 990.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 505.2589416503906, + "completions/mean_terminated_length": 505.2589416503906, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 1.2032499355171524, + "grad_norm": 0.5843690633773804, + "kl": 0.09912109375, + "learning_rate": 1e-06, + "loss": -0.0093, + "num_tokens": 135206012.0, + "reward": 1.3232144117355347, + "reward_std": 0.14009526371955872, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32321426272392273, + "rewards/curriculum_aware_reward_fn/std": 0.3698629140853882, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 748.0, + "completions/max_terminated_length": 748.0, + "completions/mean_length": 463.76788330078125, + "completions/mean_terminated_length": 463.76788330078125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 1.2042816610781533, + "grad_norm": 0.7525951266288757, + "kl": 0.104248046875, + "learning_rate": 1e-06, + "loss": -0.0168, + "num_tokens": 135324230.0, + "reward": 1.4459823369979858, + "reward_std": 0.17760148644447327, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4459821581840515, + "rewards/curriculum_aware_reward_fn/std": 0.37145552039146423, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1048.0, + "completions/max_terminated_length": 1048.0, + "completions/mean_length": 523.8928833007812, + "completions/mean_terminated_length": 523.8928833007812, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 1.2053133866391539, + "grad_norm": 0.630405604839325, + "kl": 0.078125, + "learning_rate": 1e-06, + "loss": 0.0081, + "num_tokens": 135443148.0, + "reward": 1.3714287281036377, + "reward_std": 0.1765255331993103, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.38035711646080017, + "rewards/curriculum_aware_reward_fn/std": 0.3761083781719208, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1106.0, + "completions/max_terminated_length": 1106.0, + "completions/mean_length": 400.2857360839844, + "completions/mean_terminated_length": 400.2857360839844, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 1.2063451122001547, + "grad_norm": 0.7302508354187012, + "kl": 0.0921630859375, + "learning_rate": 1e-06, + "loss": 0.0284, + "num_tokens": 135543560.0, + "reward": 1.5000003576278687, + "reward_std": 0.13220220804214478, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5000000596046448, + "rewards/curriculum_aware_reward_fn/std": 0.3808477818965912, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 960.0, + "completions/max_terminated_length": 960.0, + "completions/mean_length": 493.9285888671875, + "completions/mean_terminated_length": 493.9285888671875, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 1.2073768377611556, + "grad_norm": 0.734142005443573, + "kl": 0.0927734375, + "learning_rate": 1e-06, + "loss": 0.0133, + "num_tokens": 135665324.0, + "reward": 1.369642972946167, + "reward_std": 0.21060726046562195, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36964288353919983, + "rewards/curriculum_aware_reward_fn/std": 0.34303608536720276, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 954.0, + "completions/max_terminated_length": 954.0, + "completions/mean_length": 515.5982666015625, + "completions/mean_terminated_length": 515.5982666015625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 1.2084085633221564, + "grad_norm": 0.7119922637939453, + "kl": 0.0936279296875, + "learning_rate": 1e-06, + "loss": -0.0109, + "num_tokens": 135788927.0, + "reward": 1.3933037519454956, + "reward_std": 0.23654146492481232, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.40223217010498047, + "rewards/curriculum_aware_reward_fn/std": 0.38181573152542114, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2014.0, + "completions/max_terminated_length": 2014.0, + "completions/mean_length": 584.2053833007812, + "completions/mean_terminated_length": 584.2053833007812, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 1.209440288883157, + "grad_norm": 0.7124395966529846, + "kl": 0.0806884765625, + "learning_rate": 1e-06, + "loss": 0.0204, + "num_tokens": 135935817.0, + "reward": 1.339285969734192, + "reward_std": 0.21949851512908936, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3392857015132904, + "rewards/curriculum_aware_reward_fn/std": 0.32427525520324707, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1225.0, + "completions/max_terminated_length": 1225.0, + "completions/mean_length": 549.9910888671875, + "completions/mean_terminated_length": 549.9910888671875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 1.2104720144441579, + "grad_norm": 0.4604604244232178, + "kl": 0.08837890625, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 136064917.0, + "reward": 1.3205358982086182, + "reward_std": 0.0818769633769989, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32053571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.36394694447517395, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1015.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 499.6607360839844, + "completions/mean_terminated_length": 499.6607360839844, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 1.2115037400051587, + "grad_norm": 0.6104110479354858, + "kl": 0.0931396484375, + "learning_rate": 1e-06, + "loss": 0.032, + "num_tokens": 136200364.0, + "reward": 1.3250001668930054, + "reward_std": 0.21099622547626495, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32500001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3657030463218689, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1051.0, + "completions/max_terminated_length": 1051.0, + "completions/mean_length": 505.9285888671875, + "completions/mean_terminated_length": 505.9285888671875, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 1.2125354655661593, + "grad_norm": 0.6829614639282227, + "kl": 0.09326171875, + "learning_rate": 1e-06, + "loss": 0.011, + "num_tokens": 136324074.0, + "reward": 1.2803572416305542, + "reward_std": 0.1924617439508438, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28035715222358704, + "rewards/curriculum_aware_reward_fn/std": 0.33694157004356384, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1024.0, + "completions/mean_length": 505.2857360839844, + "completions/mean_terminated_length": 505.2857360839844, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 1.2135671911271602, + "grad_norm": 0.6645453572273254, + "kl": 0.0894775390625, + "learning_rate": 1e-06, + "loss": -0.0119, + "num_tokens": 136454971.0, + "reward": 1.345982313156128, + "reward_std": 0.13870622217655182, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3459821343421936, + "rewards/curriculum_aware_reward_fn/std": 0.3621842563152313, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 899.0, + "completions/max_terminated_length": 899.0, + "completions/mean_length": 471.7857360839844, + "completions/mean_terminated_length": 471.7857360839844, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 1.214598916688161, + "grad_norm": 0.680292546749115, + "kl": 0.0860595703125, + "learning_rate": 1e-06, + "loss": 0.0133, + "num_tokens": 136577650.0, + "reward": 1.4754464626312256, + "reward_std": 0.15931636095046997, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4754464328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3732614517211914, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1194.0, + "completions/max_terminated_length": 1194.0, + "completions/mean_length": 470.232177734375, + "completions/mean_terminated_length": 470.232177734375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 1.2156306422491618, + "grad_norm": 0.5292847752571106, + "kl": 0.087158203125, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 136711404.0, + "reward": 1.441071629524231, + "reward_std": 0.13028402626514435, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44107145071029663, + "rewards/curriculum_aware_reward_fn/std": 0.3854454755783081, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1195.0, + "completions/max_terminated_length": 1195.0, + "completions/mean_length": 458.732177734375, + "completions/mean_terminated_length": 458.732177734375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 1.2166623678101625, + "grad_norm": 0.645519495010376, + "kl": 0.09423828125, + "learning_rate": 1e-06, + "loss": 0.0103, + "num_tokens": 136827679.0, + "reward": 1.5263392925262451, + "reward_std": 0.19883407652378082, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5352678894996643, + "rewards/curriculum_aware_reward_fn/std": 0.46443992853164673, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 895.0, + "completions/max_terminated_length": 895.0, + "completions/mean_length": 496.83038330078125, + "completions/mean_terminated_length": 496.83038330078125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 1.2176940933711633, + "grad_norm": 0.6841337084770203, + "kl": 0.0869140625, + "learning_rate": 1e-06, + "loss": -0.0065, + "num_tokens": 136954927.0, + "reward": 1.3924108743667603, + "reward_std": 0.194316565990448, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3924107253551483, + "rewards/curriculum_aware_reward_fn/std": 0.3710351884365082, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 459.419677734375, + "completions/mean_terminated_length": 459.419677734375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 1.2187258189321641, + "grad_norm": 0.576664388179779, + "kl": 0.091796875, + "learning_rate": 1e-06, + "loss": 0.01, + "num_tokens": 137073512.0, + "reward": 1.3700894117355347, + "reward_std": 0.1372222900390625, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3700892925262451, + "rewards/curriculum_aware_reward_fn/std": 0.35926979780197144, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1052.0, + "completions/max_terminated_length": 1052.0, + "completions/mean_length": 487.33038330078125, + "completions/mean_terminated_length": 487.33038330078125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 1.2197575444931648, + "grad_norm": 0.6316289305686951, + "kl": 0.0909423828125, + "learning_rate": 1e-06, + "loss": 0.0081, + "num_tokens": 137192283.0, + "reward": 1.385267972946167, + "reward_std": 0.10963393747806549, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38526788353919983, + "rewards/curriculum_aware_reward_fn/std": 0.3733002543449402, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.0, + "completions/max_terminated_length": 773.0, + "completions/mean_length": 414.46429443359375, + "completions/mean_terminated_length": 414.46429443359375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 1.2207892700541656, + "grad_norm": 0.6138463616371155, + "kl": 0.08984375, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 137300758.0, + "reward": 1.5013395547866821, + "reward_std": 0.10109592229127884, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.501339316368103, + "rewards/curriculum_aware_reward_fn/std": 0.37092897295951843, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1448.0, + "completions/max_terminated_length": 1448.0, + "completions/mean_length": 524.9732666015625, + "completions/mean_terminated_length": 524.9732666015625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 1.2218209956151664, + "grad_norm": 0.6145269274711609, + "kl": 0.0877685546875, + "learning_rate": 1e-06, + "loss": 0.0161, + "num_tokens": 137432362.0, + "reward": 1.2441965341567993, + "reward_std": 0.11928559094667435, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24419642984867096, + "rewards/curriculum_aware_reward_fn/std": 0.3394315540790558, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 952.0, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 470.33929443359375, + "completions/mean_terminated_length": 470.33929443359375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 1.222852721176167, + "grad_norm": 0.6667640805244446, + "kl": 0.090576171875, + "learning_rate": 1e-06, + "loss": 0.0137, + "num_tokens": 137550063.0, + "reward": 1.520535945892334, + "reward_std": 0.2047668844461441, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5205357074737549, + "rewards/curriculum_aware_reward_fn/std": 0.47998470067977905, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1110.0, + "completions/max_terminated_length": 1110.0, + "completions/mean_length": 497.294677734375, + "completions/mean_terminated_length": 497.294677734375, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 1.223884446737168, + "grad_norm": 0.7046597003936768, + "kl": 0.0909423828125, + "learning_rate": 1e-06, + "loss": 0.0219, + "num_tokens": 137672029.0, + "reward": 1.4477680921554565, + "reward_std": 0.18011516332626343, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44776788353919983, + "rewards/curriculum_aware_reward_fn/std": 0.3280702829360962, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 492.33929443359375, + "completions/mean_terminated_length": 492.33929443359375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 1.2249161722981687, + "grad_norm": 0.5538958311080933, + "kl": 0.0887451171875, + "learning_rate": 1e-06, + "loss": -0.0113, + "num_tokens": 137799446.0, + "reward": 1.5080358982086182, + "reward_std": 0.12931200861930847, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5080357193946838, + "rewards/curriculum_aware_reward_fn/std": 0.42296433448791504, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1035.0, + "completions/max_terminated_length": 1035.0, + "completions/mean_length": 459.9107360839844, + "completions/mean_terminated_length": 459.9107360839844, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 1.2259478978591694, + "grad_norm": 0.753014326095581, + "kl": 0.091064453125, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 137922853.0, + "reward": 1.4089287519454956, + "reward_std": 0.19899839162826538, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40892860293388367, + "rewards/curriculum_aware_reward_fn/std": 0.3784281611442566, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1130.0, + "completions/max_terminated_length": 1130.0, + "completions/mean_length": 473.27679443359375, + "completions/mean_terminated_length": 473.27679443359375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 1.2269796234201702, + "grad_norm": 0.6941073536872864, + "kl": 0.0966796875, + "learning_rate": 1e-06, + "loss": 0.0333, + "num_tokens": 138040449.0, + "reward": 1.4625002145767212, + "reward_std": 0.16561460494995117, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46250003576278687, + "rewards/curriculum_aware_reward_fn/std": 0.38022634387016296, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1111.0, + "completions/max_terminated_length": 1111.0, + "completions/mean_length": 498.8125305175781, + "completions/mean_terminated_length": 498.8125305175781, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 1.228011348981171, + "grad_norm": 0.691802978515625, + "kl": 0.0828857421875, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 138156019.0, + "reward": 1.3794643878936768, + "reward_std": 0.23291635513305664, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3794642984867096, + "rewards/curriculum_aware_reward_fn/std": 0.3979991376399994, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 828.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 450.7232360839844, + "completions/mean_terminated_length": 450.7232360839844, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 1.2290430745421719, + "grad_norm": 0.5977465510368347, + "kl": 0.08984375, + "learning_rate": 1e-06, + "loss": 0.0153, + "num_tokens": 138269297.0, + "reward": 1.4200893640518188, + "reward_std": 0.16302047669887543, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4200893044471741, + "rewards/curriculum_aware_reward_fn/std": 0.4009819030761719, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 867.0, + "completions/max_terminated_length": 867.0, + "completions/mean_length": 479.83929443359375, + "completions/mean_terminated_length": 479.83929443359375, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 1.2300748001031725, + "grad_norm": 0.6863408088684082, + "kl": 0.094970703125, + "learning_rate": 1e-06, + "loss": 0.0179, + "num_tokens": 138391751.0, + "reward": 1.440178632736206, + "reward_std": 0.16329091787338257, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44017860293388367, + "rewards/curriculum_aware_reward_fn/std": 0.3516908586025238, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 878.0, + "completions/max_terminated_length": 878.0, + "completions/mean_length": 510.39288330078125, + "completions/mean_terminated_length": 510.39288330078125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 1.2311065256641733, + "grad_norm": 0.5389936566352844, + "kl": 0.086669921875, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 138518957.0, + "reward": 1.3169643878936768, + "reward_std": 0.12550340592861176, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3169642984867096, + "rewards/curriculum_aware_reward_fn/std": 0.35141628980636597, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1196.0, + "completions/max_terminated_length": 1196.0, + "completions/mean_length": 497.8482360839844, + "completions/mean_terminated_length": 497.8482360839844, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 1.2321382512251742, + "grad_norm": 0.7427835464477539, + "kl": 0.09423828125, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 138641720.0, + "reward": 1.3638393878936768, + "reward_std": 0.2322855442762375, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3638392984867096, + "rewards/curriculum_aware_reward_fn/std": 0.36392563581466675, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1054.0, + "completions/max_terminated_length": 1054.0, + "completions/mean_length": 473.77679443359375, + "completions/mean_terminated_length": 473.77679443359375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 1.2331699767861748, + "grad_norm": 0.663589596748352, + "kl": 0.0946044921875, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 138770775.0, + "reward": 1.354017972946167, + "reward_std": 0.16192704439163208, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35401788353919983, + "rewards/curriculum_aware_reward_fn/std": 0.38368406891822815, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1145.0, + "completions/max_terminated_length": 1145.0, + "completions/mean_length": 485.40179443359375, + "completions/mean_terminated_length": 485.40179443359375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 1.2342017023471756, + "grad_norm": 0.6391304135322571, + "kl": 0.0919189453125, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 138890863.0, + "reward": 1.35535728931427, + "reward_std": 0.13343793153762817, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3553571403026581, + "rewards/curriculum_aware_reward_fn/std": 0.35624146461486816, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1354.0, + "completions/max_terminated_length": 1354.0, + "completions/mean_length": 476.0714416503906, + "completions/mean_terminated_length": 476.0714416503906, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 1.2352334279081765, + "grad_norm": 0.8422456979751587, + "kl": 0.0897216796875, + "learning_rate": 1e-06, + "loss": -0.0077, + "num_tokens": 139000687.0, + "reward": 1.3571429252624512, + "reward_std": 0.2340608388185501, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3660714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3382662832736969, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 464.294677734375, + "completions/mean_terminated_length": 464.294677734375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 1.2362651534691773, + "grad_norm": 0.7451651692390442, + "kl": 0.0931396484375, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 139119384.0, + "reward": 1.3245537281036377, + "reward_std": 0.27345478534698486, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.33348217606544495, + "rewards/curriculum_aware_reward_fn/std": 0.3578834533691406, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1026.0, + "completions/max_terminated_length": 1026.0, + "completions/mean_length": 491.4732360839844, + "completions/mean_terminated_length": 491.4732360839844, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 1.237296879030178, + "grad_norm": 0.7376614809036255, + "kl": 0.0958251953125, + "learning_rate": 1e-06, + "loss": -0.0344, + "num_tokens": 139246018.0, + "reward": 1.3772321939468384, + "reward_std": 0.22673317790031433, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.377232164144516, + "rewards/curriculum_aware_reward_fn/std": 0.4012867212295532, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 470.607177734375, + "completions/mean_terminated_length": 470.607177734375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 1.2383286045911788, + "grad_norm": 0.6506025791168213, + "kl": 0.0880126953125, + "learning_rate": 1e-06, + "loss": 0.026, + "num_tokens": 139366850.0, + "reward": 1.3468750715255737, + "reward_std": 0.18790303170681, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.35580357909202576, + "rewards/curriculum_aware_reward_fn/std": 0.3913332521915436, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 934.0, + "completions/max_terminated_length": 934.0, + "completions/mean_length": 450.732177734375, + "completions/mean_terminated_length": 450.732177734375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 1.2393603301521796, + "grad_norm": 0.7297881245613098, + "kl": 0.093505859375, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 139491259.0, + "reward": 1.4642857313156128, + "reward_std": 0.12518589198589325, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4642857015132904, + "rewards/curriculum_aware_reward_fn/std": 0.3733479082584381, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1011.0, + "completions/max_terminated_length": 1011.0, + "completions/mean_length": 479.58038330078125, + "completions/mean_terminated_length": 479.58038330078125, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 1.2403920557131802, + "grad_norm": 0.7408363819122314, + "kl": 0.091064453125, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 139618738.0, + "reward": 1.3156250715255737, + "reward_std": 0.1732882559299469, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3386533856391907, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1396.0, + "completions/max_terminated_length": 1396.0, + "completions/mean_length": 521.4732666015625, + "completions/mean_terminated_length": 521.4732666015625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 1.241423781274181, + "grad_norm": 0.6051561236381531, + "kl": 0.0849609375, + "learning_rate": 1e-06, + "loss": 0.0247, + "num_tokens": 139739646.0, + "reward": 1.4366072416305542, + "reward_std": 0.20512039959430695, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.44553571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3702053129673004, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 416.0357360839844, + "completions/mean_terminated_length": 416.0357360839844, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 1.242455506835182, + "grad_norm": 0.7176798582077026, + "kl": 0.0919189453125, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 139847583.0, + "reward": 1.4732143878936768, + "reward_std": 0.200529545545578, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4821428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3688829839229584, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1159.0, + "completions/max_terminated_length": 1159.0, + "completions/mean_length": 477.2410888671875, + "completions/mean_terminated_length": 477.2410888671875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 1.2434872323961825, + "grad_norm": 0.6013513207435608, + "kl": 0.09033203125, + "learning_rate": 1e-06, + "loss": 0.0085, + "num_tokens": 139974079.0, + "reward": 1.352678656578064, + "reward_std": 0.14784403145313263, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3526785671710968, + "rewards/curriculum_aware_reward_fn/std": 0.37888169288635254, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1064.0, + "completions/max_terminated_length": 1064.0, + "completions/mean_length": 496.0982360839844, + "completions/mean_terminated_length": 496.0982360839844, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 1.2445189579571834, + "grad_norm": 0.6596572399139404, + "kl": 0.0899658203125, + "learning_rate": 1e-06, + "loss": -0.0175, + "num_tokens": 140102995.0, + "reward": 1.305803656578064, + "reward_std": 0.11023427546024323, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3058035671710968, + "rewards/curriculum_aware_reward_fn/std": 0.3495650887489319, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1341.0, + "completions/max_terminated_length": 1341.0, + "completions/mean_length": 471.607177734375, + "completions/mean_terminated_length": 471.607177734375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 1.2455506835181842, + "grad_norm": 0.7289984226226807, + "kl": 0.0968017578125, + "learning_rate": 1e-06, + "loss": -0.0078, + "num_tokens": 140223778.0, + "reward": 1.3375000953674316, + "reward_std": 0.15725064277648926, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33750003576278687, + "rewards/curriculum_aware_reward_fn/std": 0.3675766885280609, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 428.8750305175781, + "completions/mean_terminated_length": 428.8750305175781, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 1.2465824090791848, + "grad_norm": 0.7640532851219177, + "kl": 0.1015625, + "learning_rate": 1e-06, + "loss": 0.008, + "num_tokens": 140337339.0, + "reward": 1.41785728931427, + "reward_std": 0.19161218404769897, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41785717010498047, + "rewards/curriculum_aware_reward_fn/std": 0.3349638283252716, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1253.0, + "completions/max_terminated_length": 1253.0, + "completions/mean_length": 467.169677734375, + "completions/mean_terminated_length": 467.169677734375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 1.2476141346401857, + "grad_norm": 0.7718006372451782, + "kl": 0.097412109375, + "learning_rate": 1e-06, + "loss": -0.0086, + "num_tokens": 140459179.0, + "reward": 1.3183037042617798, + "reward_std": 0.17381516098976135, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.32723215222358704, + "rewards/curriculum_aware_reward_fn/std": 0.3610475957393646, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 939.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 514.2053833007812, + "completions/mean_terminated_length": 514.2053833007812, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 1.2486458602011865, + "grad_norm": 0.6831369996070862, + "kl": 0.0916748046875, + "learning_rate": 1e-06, + "loss": 0.0049, + "num_tokens": 140586602.0, + "reward": 1.288839340209961, + "reward_std": 0.21529847383499146, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28883931040763855, + "rewards/curriculum_aware_reward_fn/std": 0.3663620352745056, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1083.0, + "completions/max_terminated_length": 1083.0, + "completions/mean_length": 455.8482360839844, + "completions/mean_terminated_length": 455.8482360839844, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 1.2496775857621873, + "grad_norm": 0.749707818031311, + "kl": 0.087890625, + "learning_rate": 1e-06, + "loss": 0.0339, + "num_tokens": 140707727.0, + "reward": 1.3361608982086182, + "reward_std": 0.1401582509279251, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33616071939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3828067183494568, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 929.0, + "completions/max_terminated_length": 929.0, + "completions/mean_length": 473.7410888671875, + "completions/mean_terminated_length": 473.7410888671875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 1.250709311323188, + "grad_norm": 0.679556667804718, + "kl": 0.0899658203125, + "learning_rate": 1e-06, + "loss": 0.0213, + "num_tokens": 140828455.0, + "reward": 1.4352679252624512, + "reward_std": 0.20762227475643158, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4352678656578064, + "rewards/curriculum_aware_reward_fn/std": 0.37654417753219604, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 984.0, + "completions/max_terminated_length": 984.0, + "completions/mean_length": 533.0178833007812, + "completions/mean_terminated_length": 533.0178833007812, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 1.2517410368841888, + "grad_norm": 0.7216713428497314, + "kl": 0.0875244140625, + "learning_rate": 1e-06, + "loss": 0.0055, + "num_tokens": 140962094.0, + "reward": 1.3258929252624512, + "reward_std": 0.18453404307365417, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3258928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3267352879047394, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 470.7232360839844, + "completions/mean_terminated_length": 470.7232360839844, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 1.2527727624451896, + "grad_norm": 0.6506356000900269, + "kl": 0.094482421875, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 141081994.0, + "reward": 1.2892858982086182, + "reward_std": 0.15539708733558655, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28928571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3660987913608551, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2342.0, + "completions/max_terminated_length": 2342.0, + "completions/mean_length": 522.357177734375, + "completions/mean_terminated_length": 522.357177734375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 1.2538044880061903, + "grad_norm": 0.6972716450691223, + "kl": 0.0882568359375, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 141211828.0, + "reward": 1.394196629524231, + "reward_std": 0.17642062902450562, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39419645071029663, + "rewards/curriculum_aware_reward_fn/std": 0.3766702115535736, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1132.0, + "completions/max_terminated_length": 1132.0, + "completions/mean_length": 558.7767944335938, + "completions/mean_terminated_length": 558.7767944335938, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 1.254836213567191, + "grad_norm": 0.6484391689300537, + "kl": 0.0782470703125, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 141347953.0, + "reward": 1.3245537281036377, + "reward_std": 0.15901994705200195, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.33348211646080017, + "rewards/curriculum_aware_reward_fn/std": 0.32510486245155334, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1342.0, + "completions/max_terminated_length": 1342.0, + "completions/mean_length": 426.4732360839844, + "completions/mean_terminated_length": 426.4732360839844, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 1.255867939128192, + "grad_norm": 0.5722562074661255, + "kl": 0.1002197265625, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 141459446.0, + "reward": 1.4455357789993286, + "reward_std": 0.15726637840270996, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.45446428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.39952415227890015, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1195.0, + "completions/max_terminated_length": 1195.0, + "completions/mean_length": 458.4285888671875, + "completions/mean_terminated_length": 458.4285888671875, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 1.2568996646891928, + "grad_norm": 0.8675462007522583, + "kl": 0.0947265625, + "learning_rate": 1e-06, + "loss": 0.0206, + "num_tokens": 141578242.0, + "reward": 1.4223216772079468, + "reward_std": 0.219200998544693, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42232146859169006, + "rewards/curriculum_aware_reward_fn/std": 0.3711947798728943, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 929.0, + "completions/max_terminated_length": 929.0, + "completions/mean_length": 496.3660888671875, + "completions/mean_terminated_length": 496.3660888671875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 1.2579313902501934, + "grad_norm": 0.6168084144592285, + "kl": 0.088623046875, + "learning_rate": 1e-06, + "loss": -0.0305, + "num_tokens": 141699796.0, + "reward": 1.4183037281036377, + "reward_std": 0.18364112079143524, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41830357909202576, + "rewards/curriculum_aware_reward_fn/std": 0.38816937804222107, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1291.0, + "completions/max_terminated_length": 1291.0, + "completions/mean_length": 486.02679443359375, + "completions/mean_terminated_length": 486.02679443359375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 1.2589631158111942, + "grad_norm": 0.7693423628807068, + "kl": 0.0953369140625, + "learning_rate": 1e-06, + "loss": 0.0296, + "num_tokens": 141823097.0, + "reward": 1.360267996788025, + "reward_std": 0.18344184756278992, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36026784777641296, + "rewards/curriculum_aware_reward_fn/std": 0.359375, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 828.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 452.0625305175781, + "completions/mean_terminated_length": 452.0625305175781, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 1.2599948413721949, + "grad_norm": 0.7499447464942932, + "kl": 0.103271484375, + "learning_rate": 1e-06, + "loss": -0.0139, + "num_tokens": 141945618.0, + "reward": 1.512946605682373, + "reward_std": 0.15902185440063477, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5218750238418579, + "rewards/curriculum_aware_reward_fn/std": 0.469463050365448, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 440.4464416503906, + "completions/mean_terminated_length": 440.4464416503906, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 1.2610265669331957, + "grad_norm": 0.7502613663673401, + "kl": 0.1002197265625, + "learning_rate": 1e-06, + "loss": 0.0213, + "num_tokens": 142059187.0, + "reward": 1.47633957862854, + "reward_std": 0.2236849069595337, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47633931040763855, + "rewards/curriculum_aware_reward_fn/std": 0.36242181062698364, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1683.0, + "completions/max_terminated_length": 1683.0, + "completions/mean_length": 499.5714416503906, + "completions/mean_terminated_length": 499.5714416503906, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 1.2620582924941965, + "grad_norm": 0.7217331528663635, + "kl": 0.0931396484375, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 142181331.0, + "reward": 1.4142858982086182, + "reward_std": 0.20921562612056732, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41428571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.38193607330322266, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1464.0, + "completions/max_terminated_length": 1464.0, + "completions/mean_length": 482.5625305175781, + "completions/mean_terminated_length": 482.5625305175781, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 1.2630900180551974, + "grad_norm": 0.7930240035057068, + "kl": 0.095703125, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 142309739.0, + "reward": 1.2544643878936768, + "reward_std": 0.19663681089878082, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.2633928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.323270320892334, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 468.1964416503906, + "completions/mean_terminated_length": 468.1964416503906, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 1.2641217436161982, + "grad_norm": 0.8367300033569336, + "kl": 0.103515625, + "learning_rate": 1e-06, + "loss": 0.0218, + "num_tokens": 142435817.0, + "reward": 1.4767858982086182, + "reward_std": 0.2238132655620575, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47678571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3726535439491272, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1131.0, + "completions/max_terminated_length": 1131.0, + "completions/mean_length": 522.6875, + "completions/mean_terminated_length": 522.6875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 1.2651534691771988, + "grad_norm": 0.7651290893554688, + "kl": 0.0888671875, + "learning_rate": 1e-06, + "loss": 0.0097, + "num_tokens": 142563679.0, + "reward": 1.3316963911056519, + "reward_std": 0.1838735193014145, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33169645071029663, + "rewards/curriculum_aware_reward_fn/std": 0.3260461688041687, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1438.0, + "completions/max_terminated_length": 1438.0, + "completions/mean_length": 538.2142944335938, + "completions/mean_terminated_length": 538.2142944335938, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 1.2661851947381997, + "grad_norm": 0.7349340319633484, + "kl": 0.091796875, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 142701176.0, + "reward": 1.3214287757873535, + "reward_std": 0.23082859814167023, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3214285671710968, + "rewards/curriculum_aware_reward_fn/std": 0.35097768902778625, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 471.2232360839844, + "completions/mean_terminated_length": 471.2232360839844, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 1.2672169202992003, + "grad_norm": 0.7736876010894775, + "kl": 0.10205078125, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 142821762.0, + "reward": 1.4781252145767212, + "reward_std": 0.15466152131557465, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4781250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3568998873233795, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 466.9285888671875, + "completions/mean_terminated_length": 466.9285888671875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 1.2682486458602011, + "grad_norm": 0.6920589208602905, + "kl": 0.10595703125, + "learning_rate": 1e-06, + "loss": 0.0032, + "num_tokens": 142940460.0, + "reward": 1.3821431398391724, + "reward_std": 0.1856403797864914, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3821428716182709, + "rewards/curriculum_aware_reward_fn/std": 0.3613573908805847, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 472.9285888671875, + "completions/mean_terminated_length": 472.9285888671875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 1.269280371421202, + "grad_norm": 0.7609573602676392, + "kl": 0.1005859375, + "learning_rate": 1e-06, + "loss": 0.0077, + "num_tokens": 143061762.0, + "reward": 1.394196629524231, + "reward_std": 0.17537376284599304, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39419645071029663, + "rewards/curriculum_aware_reward_fn/std": 0.34613344073295593, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 934.0, + "completions/max_terminated_length": 934.0, + "completions/mean_length": 481.7232360839844, + "completions/mean_terminated_length": 481.7232360839844, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 1.2703120969822028, + "grad_norm": 0.7466670870780945, + "kl": 0.0968017578125, + "learning_rate": 1e-06, + "loss": 0.0211, + "num_tokens": 143182726.0, + "reward": 1.4718750715255737, + "reward_std": 0.1868813931941986, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47187498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.36010393500328064, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1468.0, + "completions/max_terminated_length": 1468.0, + "completions/mean_length": 538.8660888671875, + "completions/mean_terminated_length": 538.8660888671875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 1.2713438225432034, + "grad_norm": 0.7122594714164734, + "kl": 0.097412109375, + "learning_rate": 1e-06, + "loss": 0.0168, + "num_tokens": 143315263.0, + "reward": 1.2933037281036377, + "reward_std": 0.19857461750507355, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29330357909202576, + "rewards/curriculum_aware_reward_fn/std": 0.3709918260574341, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1390.0, + "completions/max_terminated_length": 1390.0, + "completions/mean_length": 533.9553833007812, + "completions/mean_terminated_length": 533.9553833007812, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 1.2723755481042043, + "grad_norm": 0.6058053970336914, + "kl": 0.09716796875, + "learning_rate": 1e-06, + "loss": 0.0121, + "num_tokens": 143442343.0, + "reward": 1.3241071701049805, + "reward_std": 0.12093013525009155, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32410717010498047, + "rewards/curriculum_aware_reward_fn/std": 0.3290706276893616, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1076.0, + "completions/max_terminated_length": 1076.0, + "completions/mean_length": 463.2857360839844, + "completions/mean_terminated_length": 463.2857360839844, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 1.2734072736652051, + "grad_norm": 0.7048044204711914, + "kl": 0.0992431640625, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 143563230.0, + "reward": 1.4361608028411865, + "reward_std": 0.19449612498283386, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.44508928060531616, + "rewards/curriculum_aware_reward_fn/std": 0.3961503803730011, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1277.0, + "completions/max_terminated_length": 1277.0, + "completions/mean_length": 503.8125305175781, + "completions/mean_terminated_length": 503.8125305175781, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 1.2744389992262057, + "grad_norm": 0.7165812253952026, + "kl": 0.0982666015625, + "learning_rate": 1e-06, + "loss": 0.018, + "num_tokens": 143683203.0, + "reward": 1.3352679014205933, + "reward_std": 0.17653267085552216, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3352678716182709, + "rewards/curriculum_aware_reward_fn/std": 0.35789918899536133, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1197.0, + "completions/max_terminated_length": 1197.0, + "completions/mean_length": 555.8660888671875, + "completions/mean_terminated_length": 555.8660888671875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 1.2754707247872066, + "grad_norm": 0.6849974393844604, + "kl": 0.09130859375, + "learning_rate": 1e-06, + "loss": -0.0134, + "num_tokens": 143811036.0, + "reward": 1.2950893640518188, + "reward_std": 0.14985202252864838, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2950892746448517, + "rewards/curriculum_aware_reward_fn/std": 0.31615370512008667, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1254.0, + "completions/max_terminated_length": 1254.0, + "completions/mean_length": 580.9017944335938, + "completions/mean_terminated_length": 580.9017944335938, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 1.2765024503482074, + "grad_norm": 0.6134920120239258, + "kl": 0.0875244140625, + "learning_rate": 1e-06, + "loss": 0.016, + "num_tokens": 143947000.0, + "reward": 1.3066965341567993, + "reward_std": 0.12845320999622345, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30669644474983215, + "rewards/curriculum_aware_reward_fn/std": 0.3435044288635254, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1131.0, + "completions/max_terminated_length": 1131.0, + "completions/mean_length": 469.8214416503906, + "completions/mean_terminated_length": 469.8214416503906, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 1.2775341759092083, + "grad_norm": 0.7334262132644653, + "kl": 0.0987548828125, + "learning_rate": 1e-06, + "loss": 0.01, + "num_tokens": 144062697.0, + "reward": 1.4803574085235596, + "reward_std": 0.16172140836715698, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48035717010498047, + "rewards/curriculum_aware_reward_fn/std": 0.3639966547489166, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 500.0000305175781, + "completions/mean_terminated_length": 500.0000305175781, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 1.2785659014702089, + "grad_norm": 0.7100305557250977, + "kl": 0.1038818359375, + "learning_rate": 1e-06, + "loss": -0.005, + "num_tokens": 144200359.0, + "reward": 1.3312500715255737, + "reward_std": 0.18355976045131683, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33124998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3559738099575043, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1698.0, + "completions/max_terminated_length": 1698.0, + "completions/mean_length": 493.08038330078125, + "completions/mean_terminated_length": 493.08038330078125, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 1.2795976270312097, + "grad_norm": 0.7557586431503296, + "kl": 0.11279296875, + "learning_rate": 1e-06, + "loss": 0.0167, + "num_tokens": 144329063.0, + "reward": 1.416517972946167, + "reward_std": 0.1822463870048523, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41651788353919983, + "rewards/curriculum_aware_reward_fn/std": 0.3514697849750519, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 476.7232360839844, + "completions/mean_terminated_length": 476.7232360839844, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 1.2806293525922103, + "grad_norm": 0.7205198407173157, + "kl": 0.1007080078125, + "learning_rate": 1e-06, + "loss": 0.0231, + "num_tokens": 144451168.0, + "reward": 1.2892858982086182, + "reward_std": 0.14720794558525085, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28928571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.34297508001327515, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 450.5535888671875, + "completions/mean_terminated_length": 450.5535888671875, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 1.2816610781532112, + "grad_norm": 0.712121844291687, + "kl": 0.1102294921875, + "learning_rate": 1e-06, + "loss": 0.0158, + "num_tokens": 144566814.0, + "reward": 1.3857144117355347, + "reward_std": 0.1151876151561737, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3857143223285675, + "rewards/curriculum_aware_reward_fn/std": 0.3716549575328827, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1576.0, + "completions/max_terminated_length": 1576.0, + "completions/mean_length": 524.2053833007812, + "completions/mean_terminated_length": 524.2053833007812, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 1.282692803714212, + "grad_norm": 0.7179633975028992, + "kl": 0.100341796875, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 144695784.0, + "reward": 1.3245537281036377, + "reward_std": 0.18321338295936584, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32455357909202576, + "rewards/curriculum_aware_reward_fn/std": 0.350546270608902, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1109.0, + "completions/max_terminated_length": 1109.0, + "completions/mean_length": 489.5982360839844, + "completions/mean_terminated_length": 489.5982360839844, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 1.2837245292752129, + "grad_norm": 0.6813969016075134, + "kl": 0.1053466796875, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 144831938.0, + "reward": 1.3406251668930054, + "reward_std": 0.15983448922634125, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.34955358505249023, + "rewards/curriculum_aware_reward_fn/std": 0.3607734441757202, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 472.6607360839844, + "completions/mean_terminated_length": 472.6607360839844, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 1.2847562548362137, + "grad_norm": 0.7200454473495483, + "kl": 0.0999755859375, + "learning_rate": 1e-06, + "loss": 0.0254, + "num_tokens": 144951068.0, + "reward": 1.3401787281036377, + "reward_std": 0.1479780077934265, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34017854928970337, + "rewards/curriculum_aware_reward_fn/std": 0.3724668025970459, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1053.0, + "completions/max_terminated_length": 1053.0, + "completions/mean_length": 528.5178833007812, + "completions/mean_terminated_length": 528.5178833007812, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 1.2857879803972143, + "grad_norm": 0.74029541015625, + "kl": 0.105712890625, + "learning_rate": 1e-06, + "loss": -0.0062, + "num_tokens": 145078841.0, + "reward": 1.2727679014205933, + "reward_std": 0.22440676391124725, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2727678716182709, + "rewards/curriculum_aware_reward_fn/std": 0.34880027174949646, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 998.0, + "completions/max_terminated_length": 998.0, + "completions/mean_length": 457.6875305175781, + "completions/mean_terminated_length": 457.6875305175781, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 1.2868197059582152, + "grad_norm": 0.7149326205253601, + "kl": 0.1051025390625, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 145197015.0, + "reward": 1.376339316368103, + "reward_std": 0.16753913462162018, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37633928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3888050615787506, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 428.2946472167969, + "completions/mean_terminated_length": 428.2946472167969, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 1.2878514315192158, + "grad_norm": 0.6227552890777588, + "kl": 0.0963134765625, + "learning_rate": 1e-06, + "loss": -0.0127, + "num_tokens": 145303770.0, + "reward": 1.481696605682373, + "reward_std": 0.13157878816127777, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4816964268684387, + "rewards/curriculum_aware_reward_fn/std": 0.3921525180339813, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 469.3125305175781, + "completions/mean_terminated_length": 469.3125305175781, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 1.2888831570802166, + "grad_norm": 0.7807196378707886, + "kl": 0.095458984375, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 145429217.0, + "reward": 1.4250000715255737, + "reward_std": 0.20809811353683472, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42499998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3775215744972229, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 887.0, + "completions/max_terminated_length": 887.0, + "completions/mean_length": 445.9732360839844, + "completions/mean_terminated_length": 445.9732360839844, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 1.2899148826412175, + "grad_norm": 0.8612940907478333, + "kl": 0.115966796875, + "learning_rate": 1e-06, + "loss": -0.0091, + "num_tokens": 145542145.0, + "reward": 1.3575894832611084, + "reward_std": 0.21384631097316742, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3665178716182709, + "rewards/curriculum_aware_reward_fn/std": 0.3716525435447693, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1211.0, + "completions/max_terminated_length": 1211.0, + "completions/mean_length": 463.5625305175781, + "completions/mean_terminated_length": 463.5625305175781, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 1.2909466082022183, + "grad_norm": 0.756700336933136, + "kl": 0.0977783203125, + "learning_rate": 1e-06, + "loss": 0.0401, + "num_tokens": 145656441.0, + "reward": 1.4415180683135986, + "reward_std": 0.21089234948158264, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4504464566707611, + "rewards/curriculum_aware_reward_fn/std": 0.3938431441783905, + "step": 1251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 865.0, + "completions/max_terminated_length": 865.0, + "completions/mean_length": 425.0089416503906, + "completions/mean_terminated_length": 425.0089416503906, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 1.291978333763219, + "grad_norm": 0.7736772298812866, + "kl": 0.100830078125, + "learning_rate": 1e-06, + "loss": 0.0169, + "num_tokens": 145768883.0, + "reward": 1.4053571224212646, + "reward_std": 0.10741391032934189, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.41428571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3921199440956116, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1217.0, + "completions/max_terminated_length": 1217.0, + "completions/mean_length": 450.26788330078125, + "completions/mean_terminated_length": 450.26788330078125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 1.2930100593242198, + "grad_norm": 0.744135320186615, + "kl": 0.103271484375, + "learning_rate": 1e-06, + "loss": 0.0131, + "num_tokens": 145879464.0, + "reward": 1.3531252145767212, + "reward_std": 0.1737460494041443, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35312503576278687, + "rewards/curriculum_aware_reward_fn/std": 0.37688153982162476, + "step": 1253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 434.8125305175781, + "completions/mean_terminated_length": 434.8125305175781, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 1.2940417848852206, + "grad_norm": 0.680141270160675, + "kl": 0.1029052734375, + "learning_rate": 1e-06, + "loss": -0.003, + "num_tokens": 146001166.0, + "reward": 1.2763392925262451, + "reward_std": 0.14037790894508362, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2763392925262451, + "rewards/curriculum_aware_reward_fn/std": 0.36791107058525085, + "step": 1254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 414.51788330078125, + "completions/mean_terminated_length": 414.51788330078125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 1.2950735104462212, + "grad_norm": 0.6044324636459351, + "kl": 0.1021728515625, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 146109787.0, + "reward": 1.4133929014205933, + "reward_std": 0.11665144562721252, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4133928716182709, + "rewards/curriculum_aware_reward_fn/std": 0.3988712728023529, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 898.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 460.6875305175781, + "completions/mean_terminated_length": 460.6875305175781, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 1.296105236007222, + "grad_norm": 0.7419890761375427, + "kl": 0.09716796875, + "learning_rate": 1e-06, + "loss": 0.0097, + "num_tokens": 146228493.0, + "reward": 1.2852680683135986, + "reward_std": 0.1637239307165146, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2852678596973419, + "rewards/curriculum_aware_reward_fn/std": 0.3315020799636841, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1421.0, + "completions/max_terminated_length": 1421.0, + "completions/mean_length": 530.1964721679688, + "completions/mean_terminated_length": 530.1964721679688, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 1.297136961568223, + "grad_norm": 0.6371442675590515, + "kl": 0.084228515625, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 146357812.0, + "reward": 1.391517996788025, + "reward_std": 0.163621187210083, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39151787757873535, + "rewards/curriculum_aware_reward_fn/std": 0.3548815846443176, + "step": 1257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 434.0357360839844, + "completions/mean_terminated_length": 434.0357360839844, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 1.2981686871292237, + "grad_norm": 0.7808496952056885, + "kl": 0.102783203125, + "learning_rate": 1e-06, + "loss": -0.0118, + "num_tokens": 146469809.0, + "reward": 1.4352679252624512, + "reward_std": 0.21879854798316956, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4352678954601288, + "rewards/curriculum_aware_reward_fn/std": 0.4187948405742645, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1025.0, + "completions/max_terminated_length": 1025.0, + "completions/mean_length": 463.232177734375, + "completions/mean_terminated_length": 463.232177734375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 1.2992004126902243, + "grad_norm": 0.660305917263031, + "kl": 0.115478515625, + "learning_rate": 1e-06, + "loss": 0.0152, + "num_tokens": 146599006.0, + "reward": 1.3049108982086182, + "reward_std": 0.13474413752555847, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.31383928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3810375928878784, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1057.0, + "completions/max_terminated_length": 1057.0, + "completions/mean_length": 466.794677734375, + "completions/mean_terminated_length": 466.794677734375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 1.3002321382512252, + "grad_norm": 0.7298482656478882, + "kl": 0.1090087890625, + "learning_rate": 1e-06, + "loss": -0.0119, + "num_tokens": 146721819.0, + "reward": 1.383928656578064, + "reward_std": 0.18910807371139526, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3839285671710968, + "rewards/curriculum_aware_reward_fn/std": 0.47125548124313354, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 868.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 467.2410888671875, + "completions/mean_terminated_length": 467.2410888671875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 1.301263863812226, + "grad_norm": 0.7236865758895874, + "kl": 0.09375, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 146840370.0, + "reward": 1.3031251430511475, + "reward_std": 0.18334250152111053, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3031249940395355, + "rewards/curriculum_aware_reward_fn/std": 0.3641156852245331, + "step": 1261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 411.4375305175781, + "completions/mean_terminated_length": 411.4375305175781, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 1.3022955893732266, + "grad_norm": 0.8327616453170776, + "kl": 0.1168212890625, + "learning_rate": 1e-06, + "loss": 0.0095, + "num_tokens": 146957991.0, + "reward": 1.3495537042617798, + "reward_std": 0.17797954380512238, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34955358505249023, + "rewards/curriculum_aware_reward_fn/std": 0.3442249298095703, + "step": 1262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 946.0, + "completions/max_terminated_length": 946.0, + "completions/mean_length": 479.1607360839844, + "completions/mean_terminated_length": 479.1607360839844, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 1.3033273149342275, + "grad_norm": 0.6464178562164307, + "kl": 0.0970458984375, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 147089532.0, + "reward": 1.341071605682373, + "reward_std": 0.21358288824558258, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3410714268684387, + "rewards/curriculum_aware_reward_fn/std": 0.38020941615104675, + "step": 1263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1121.0, + "completions/max_terminated_length": 1121.0, + "completions/mean_length": 440.26788330078125, + "completions/mean_terminated_length": 440.26788330078125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 1.3043590404952283, + "grad_norm": 0.830440878868103, + "kl": 0.1002197265625, + "learning_rate": 1e-06, + "loss": -0.0086, + "num_tokens": 147208030.0, + "reward": 1.49598228931427, + "reward_std": 0.17758455872535706, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.49598217010498047, + "rewards/curriculum_aware_reward_fn/std": 0.3642923831939697, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 935.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 464.58038330078125, + "completions/mean_terminated_length": 464.58038330078125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 1.3053907660562292, + "grad_norm": 0.7556714415550232, + "kl": 0.1107177734375, + "learning_rate": 1e-06, + "loss": 0.0145, + "num_tokens": 147333395.0, + "reward": 1.3482143878936768, + "reward_std": 0.18016089498996735, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3571428954601288, + "rewards/curriculum_aware_reward_fn/std": 0.3552601933479309, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 806.0, + "completions/max_terminated_length": 806.0, + "completions/mean_length": 444.0000305175781, + "completions/mean_terminated_length": 444.0000305175781, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 1.3064224916172298, + "grad_norm": 0.7321702837944031, + "kl": 0.107177734375, + "learning_rate": 1e-06, + "loss": -0.0087, + "num_tokens": 147453074.0, + "reward": 1.2883929014205933, + "reward_std": 0.1664956659078598, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2883928716182709, + "rewards/curriculum_aware_reward_fn/std": 0.35526806116104126, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 977.0, + "completions/max_terminated_length": 977.0, + "completions/mean_length": 484.77679443359375, + "completions/mean_terminated_length": 484.77679443359375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 1.3074542171782306, + "grad_norm": 0.7560392022132874, + "kl": 0.09521484375, + "learning_rate": 1e-06, + "loss": 0.026, + "num_tokens": 147577327.0, + "reward": 1.383928656578064, + "reward_std": 0.150946244597435, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3839285671710968, + "rewards/curriculum_aware_reward_fn/std": 0.36287233233451843, + "step": 1267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.0, + "completions/max_terminated_length": 792.0, + "completions/mean_length": 443.232177734375, + "completions/mean_terminated_length": 443.232177734375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 1.3084859427392312, + "grad_norm": 0.7266552448272705, + "kl": 0.1044921875, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 147688625.0, + "reward": 1.3571429252624512, + "reward_std": 0.16170468926429749, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3571428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.35923197865486145, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 477.5089416503906, + "completions/mean_terminated_length": 477.5089416503906, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 1.309517668300232, + "grad_norm": 0.9126338958740234, + "kl": 0.100830078125, + "learning_rate": 1e-06, + "loss": -0.0169, + "num_tokens": 147816593.0, + "reward": 1.3531252145767212, + "reward_std": 0.22347509860992432, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35312503576278687, + "rewards/curriculum_aware_reward_fn/std": 0.35623666644096375, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 497.39288330078125, + "completions/mean_terminated_length": 497.39288330078125, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 1.310549393861233, + "grad_norm": 0.7279953956604004, + "kl": 0.1009521484375, + "learning_rate": 1e-06, + "loss": 0.0142, + "num_tokens": 147941216.0, + "reward": 1.3294644355773926, + "reward_std": 0.1885773241519928, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.33839288353919983, + "rewards/curriculum_aware_reward_fn/std": 0.33785107731819153, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1239.0, + "completions/max_terminated_length": 1239.0, + "completions/mean_length": 463.5714416503906, + "completions/mean_terminated_length": 463.5714416503906, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 1.3115811194222338, + "grad_norm": 0.6684086322784424, + "kl": 0.1041259765625, + "learning_rate": 1e-06, + "loss": -0.0024, + "num_tokens": 148058194.0, + "reward": 1.4455358982086182, + "reward_std": 0.15577620267868042, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44553571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.395729124546051, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1330.0, + "completions/max_terminated_length": 1330.0, + "completions/mean_length": 494.96429443359375, + "completions/mean_terminated_length": 494.96429443359375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 1.3126128449832344, + "grad_norm": 0.7396094799041748, + "kl": 0.09814453125, + "learning_rate": 1e-06, + "loss": 0.0198, + "num_tokens": 148179325.0, + "reward": 1.4459823369979858, + "reward_std": 0.15884001553058624, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4459821879863739, + "rewards/curriculum_aware_reward_fn/std": 0.37567588686943054, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 475.3750305175781, + "completions/mean_terminated_length": 475.3750305175781, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 1.3136445705442352, + "grad_norm": 0.6397110223770142, + "kl": 0.1038818359375, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 148296400.0, + "reward": 1.3174108266830444, + "reward_std": 0.1539250612258911, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.31741073727607727, + "rewards/curriculum_aware_reward_fn/std": 0.3694840669631958, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1061.0, + "completions/max_terminated_length": 1061.0, + "completions/mean_length": 471.8839416503906, + "completions/mean_terminated_length": 471.8839416503906, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 1.314676296105236, + "grad_norm": 0.7454695105552673, + "kl": 0.1060791015625, + "learning_rate": 1e-06, + "loss": 0.0187, + "num_tokens": 148415576.0, + "reward": 1.4093750715255737, + "reward_std": 0.15189561247825623, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40937501192092896, + "rewards/curriculum_aware_reward_fn/std": 0.37299203872680664, + "step": 1274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1387.0, + "completions/max_terminated_length": 1387.0, + "completions/mean_length": 460.95538330078125, + "completions/mean_terminated_length": 460.95538330078125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 1.3157080216662367, + "grad_norm": 0.821291446685791, + "kl": 0.112548828125, + "learning_rate": 1e-06, + "loss": -0.0056, + "num_tokens": 148525149.0, + "reward": 1.3482143878936768, + "reward_std": 0.18040023744106293, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3571428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.35967057943344116, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 428.7946472167969, + "completions/mean_terminated_length": 428.7946472167969, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 1.3167397472272375, + "grad_norm": 0.6388067007064819, + "kl": 0.1075439453125, + "learning_rate": 1e-06, + "loss": 0.0167, + "num_tokens": 148635502.0, + "reward": 1.252678632736206, + "reward_std": 0.12519048154354095, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25267860293388367, + "rewards/curriculum_aware_reward_fn/std": 0.33408740162849426, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1192.0, + "completions/max_terminated_length": 1192.0, + "completions/mean_length": 510.9910888671875, + "completions/mean_terminated_length": 510.9910888671875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 1.3177714727882384, + "grad_norm": 0.7216881513595581, + "kl": 0.093017578125, + "learning_rate": 1e-06, + "loss": 0.0177, + "num_tokens": 148759681.0, + "reward": 1.368303656578064, + "reward_std": 0.17615720629692078, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3683035671710968, + "rewards/curriculum_aware_reward_fn/std": 0.3455636203289032, + "step": 1277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 467.982177734375, + "completions/mean_terminated_length": 467.982177734375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 1.3188031983492392, + "grad_norm": 0.7484327554702759, + "kl": 0.1060791015625, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 148878686.0, + "reward": 1.4361608028411865, + "reward_std": 0.16896887123584747, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43616074323654175, + "rewards/curriculum_aware_reward_fn/std": 0.36243733763694763, + "step": 1278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1004.0, + "completions/max_terminated_length": 1004.0, + "completions/mean_length": 487.6339416503906, + "completions/mean_terminated_length": 487.6339416503906, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 1.3198349239102398, + "grad_norm": 0.8039454221725464, + "kl": 0.0941162109375, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 149004316.0, + "reward": 1.2625000476837158, + "reward_std": 0.20210020244121552, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.27142855525016785, + "rewards/curriculum_aware_reward_fn/std": 0.34684643149375916, + "step": 1279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 504.482177734375, + "completions/mean_terminated_length": 504.482177734375, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 1.3208666494712407, + "grad_norm": 0.7807563543319702, + "kl": 0.0994873046875, + "learning_rate": 1e-06, + "loss": -0.0062, + "num_tokens": 149136866.0, + "reward": 1.3790180683135986, + "reward_std": 0.14714425802230835, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3790178596973419, + "rewards/curriculum_aware_reward_fn/std": 0.36096954345703125, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 489.857177734375, + "completions/mean_terminated_length": 489.857177734375, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 1.3218983750322415, + "grad_norm": 0.7423040866851807, + "kl": 0.1202392578125, + "learning_rate": 1e-06, + "loss": -0.0131, + "num_tokens": 149258818.0, + "reward": 1.2906250953674316, + "reward_std": 0.13313591480255127, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29062503576278687, + "rewards/curriculum_aware_reward_fn/std": 0.3379543125629425, + "step": 1281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 874.0, + "completions/max_terminated_length": 874.0, + "completions/mean_length": 459.3482360839844, + "completions/mean_terminated_length": 459.3482360839844, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 1.3229301005932421, + "grad_norm": 0.6583361625671387, + "kl": 0.1065673828125, + "learning_rate": 1e-06, + "loss": 0.0134, + "num_tokens": 149381917.0, + "reward": 1.5464287996292114, + "reward_std": 0.14643734693527222, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5553571581840515, + "rewards/curriculum_aware_reward_fn/std": 0.4766794741153717, + "step": 1282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 916.0, + "completions/max_terminated_length": 916.0, + "completions/mean_length": 461.607177734375, + "completions/mean_terminated_length": 461.607177734375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 1.323961826154243, + "grad_norm": 0.7041372656822205, + "kl": 0.1014404296875, + "learning_rate": 1e-06, + "loss": 0.0032, + "num_tokens": 149501832.0, + "reward": 1.3616071939468384, + "reward_std": 0.14573663473129272, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3616071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.3674223721027374, + "step": 1283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 797.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 456.83038330078125, + "completions/mean_terminated_length": 456.83038330078125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 1.3249935517152438, + "grad_norm": 0.8035464882850647, + "kl": 0.106689453125, + "learning_rate": 1e-06, + "loss": 0.0209, + "num_tokens": 149624103.0, + "reward": 1.3138394355773926, + "reward_std": 0.1782553344964981, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.31383928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.36712971329689026, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1754.0, + "completions/max_terminated_length": 1754.0, + "completions/mean_length": 497.27679443359375, + "completions/mean_terminated_length": 497.27679443359375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 1.3260252772762446, + "grad_norm": 0.7910950779914856, + "kl": 0.104248046875, + "learning_rate": 1e-06, + "loss": -0.0027, + "num_tokens": 149746137.0, + "reward": 1.4767857789993286, + "reward_std": 0.20006127655506134, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47678571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.38636261224746704, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 852.0, + "completions/max_terminated_length": 852.0, + "completions/mean_length": 486.9107360839844, + "completions/mean_terminated_length": 486.9107360839844, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 1.3270570028372453, + "grad_norm": 0.7017124891281128, + "kl": 0.09912109375, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 149873590.0, + "reward": 1.3361608982086182, + "reward_std": 0.19810323417186737, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33616071939468384, + "rewards/curriculum_aware_reward_fn/std": 0.37405797839164734, + "step": 1286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 930.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 479.70538330078125, + "completions/mean_terminated_length": 479.70538330078125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 1.328088728398246, + "grad_norm": 0.7664874792098999, + "kl": 0.115478515625, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 149994230.0, + "reward": 1.247321605682373, + "reward_std": 0.1274891495704651, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24732144176959991, + "rewards/curriculum_aware_reward_fn/std": 0.3254822790622711, + "step": 1287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 475.7410888671875, + "completions/mean_terminated_length": 475.7410888671875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 1.3291204539592467, + "grad_norm": 0.6920973062515259, + "kl": 0.1043701171875, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 150116085.0, + "reward": 1.2580358982086182, + "reward_std": 0.18575353920459747, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25803571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3629554510116577, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1018.0, + "completions/max_terminated_length": 1018.0, + "completions/mean_length": 479.4732360839844, + "completions/mean_terminated_length": 479.4732360839844, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 1.3301521795202476, + "grad_norm": 0.7353091239929199, + "kl": 0.10546875, + "learning_rate": 1e-06, + "loss": 0.0163, + "num_tokens": 150236993.0, + "reward": 1.411607265472412, + "reward_std": 0.15262123942375183, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41160717606544495, + "rewards/curriculum_aware_reward_fn/std": 0.34129998087882996, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 435.1696472167969, + "completions/mean_terminated_length": 435.1696472167969, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 1.3311839050812484, + "grad_norm": 0.8722867965698242, + "kl": 0.120849609375, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 150353093.0, + "reward": 1.4982144832611084, + "reward_std": 0.22456486523151398, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4982143044471741, + "rewards/curriculum_aware_reward_fn/std": 0.3938106894493103, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1175.0, + "completions/max_terminated_length": 1175.0, + "completions/mean_length": 529.232177734375, + "completions/mean_terminated_length": 529.232177734375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 1.3322156306422492, + "grad_norm": 0.7795112729072571, + "kl": 0.104248046875, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 150481030.0, + "reward": 1.3160717487335205, + "reward_std": 0.19670888781547546, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.31607145071029663, + "rewards/curriculum_aware_reward_fn/std": 0.3604435920715332, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 961.0, + "completions/max_terminated_length": 961.0, + "completions/mean_length": 482.20538330078125, + "completions/mean_terminated_length": 482.20538330078125, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 1.3332473562032499, + "grad_norm": 0.9019424915313721, + "kl": 0.1171875, + "learning_rate": 1e-06, + "loss": -0.008, + "num_tokens": 150599815.0, + "reward": 1.4316965341567993, + "reward_std": 0.195232093334198, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43169641494750977, + "rewards/curriculum_aware_reward_fn/std": 0.37156811356544495, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 448.6607360839844, + "completions/mean_terminated_length": 448.6607360839844, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 1.3342790817642507, + "grad_norm": 0.7510018944740295, + "kl": 0.1064453125, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 150720086.0, + "reward": 1.4589285850524902, + "reward_std": 0.16231584548950195, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4589286148548126, + "rewards/curriculum_aware_reward_fn/std": 0.3895720839500427, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1511.0, + "completions/max_terminated_length": 1511.0, + "completions/mean_length": 477.8035888671875, + "completions/mean_terminated_length": 477.8035888671875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 1.3353108073252515, + "grad_norm": 0.8627240061759949, + "kl": 0.1124267578125, + "learning_rate": 1e-06, + "loss": -0.0039, + "num_tokens": 150837174.0, + "reward": 1.305803656578064, + "reward_std": 0.21191909909248352, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3058035671710968, + "rewards/curriculum_aware_reward_fn/std": 0.34227287769317627, + "step": 1294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 923.0, + "completions/max_terminated_length": 923.0, + "completions/mean_length": 456.26788330078125, + "completions/mean_terminated_length": 456.26788330078125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 1.3363425328862522, + "grad_norm": 0.7269176840782166, + "kl": 0.0921630859375, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 150953355.0, + "reward": 1.4727680683135986, + "reward_std": 0.15642473101615906, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4727678596973419, + "rewards/curriculum_aware_reward_fn/std": 0.39654409885406494, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3380.0, + "completions/max_terminated_length": 3380.0, + "completions/mean_length": 513.0982666015625, + "completions/mean_terminated_length": 513.0982666015625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 1.337374258447253, + "grad_norm": 0.8845184445381165, + "kl": 0.1019287109375, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 151073815.0, + "reward": 1.3383928537368774, + "reward_std": 0.23095576465129852, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33839288353919983, + "rewards/curriculum_aware_reward_fn/std": 0.3643445670604706, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 976.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 451.6964416503906, + "completions/mean_terminated_length": 451.6964416503906, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 1.3384059840082538, + "grad_norm": 0.6694154739379883, + "kl": 0.103759765625, + "learning_rate": 1e-06, + "loss": 0.0129, + "num_tokens": 151188469.0, + "reward": 1.4058037996292114, + "reward_std": 0.14767718315124512, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4058035910129547, + "rewards/curriculum_aware_reward_fn/std": 0.393399715423584, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1242.0, + "completions/max_terminated_length": 1242.0, + "completions/mean_length": 481.83929443359375, + "completions/mean_terminated_length": 481.83929443359375, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 1.3394377095692547, + "grad_norm": 0.6722558736801147, + "kl": 0.1217041015625, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 151309159.0, + "reward": 1.244642972946167, + "reward_std": 0.12828920781612396, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.25357145071029663, + "rewards/curriculum_aware_reward_fn/std": 0.35110601782798767, + "step": 1298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1044.0, + "completions/max_terminated_length": 1044.0, + "completions/mean_length": 483.9732360839844, + "completions/mean_terminated_length": 483.9732360839844, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 1.3404694351302553, + "grad_norm": 0.6597891449928284, + "kl": 0.10498046875, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 151437193.0, + "reward": 1.2696430683135986, + "reward_std": 0.10995526611804962, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2696428596973419, + "rewards/curriculum_aware_reward_fn/std": 0.3576926290988922, + "step": 1299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 489.1785888671875, + "completions/mean_terminated_length": 489.1785888671875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 1.3415011606912561, + "grad_norm": 0.797222375869751, + "kl": 0.1046142578125, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 151562002.0, + "reward": 1.3330358266830444, + "reward_std": 0.2373420149087906, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3330357074737549, + "rewards/curriculum_aware_reward_fn/std": 0.3669053614139557, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 978.0, + "completions/max_terminated_length": 978.0, + "completions/mean_length": 487.14288330078125, + "completions/mean_terminated_length": 487.14288330078125, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 1.342532886252257, + "grad_norm": 0.6237560510635376, + "kl": 0.1055908203125, + "learning_rate": 1e-06, + "loss": -0.0111, + "num_tokens": 151688235.0, + "reward": 1.2852680683135986, + "reward_std": 0.11172077059745789, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2852678596973419, + "rewards/curriculum_aware_reward_fn/std": 0.34910687804222107, + "step": 1301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1502.0, + "completions/max_terminated_length": 1502.0, + "completions/mean_length": 582.1517944335938, + "completions/mean_terminated_length": 582.1517944335938, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 1.3435646118132576, + "grad_norm": 0.7618681192398071, + "kl": 0.1015625, + "learning_rate": 1e-06, + "loss": 0.0461, + "num_tokens": 151826656.0, + "reward": 1.3089287281036377, + "reward_std": 0.21061131358146667, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30892854928970337, + "rewards/curriculum_aware_reward_fn/std": 0.3483690619468689, + "step": 1302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1174.0, + "completions/max_terminated_length": 1174.0, + "completions/mean_length": 497.8839416503906, + "completions/mean_terminated_length": 497.8839416503906, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 1.3445963373742584, + "grad_norm": 0.6274468898773193, + "kl": 0.1064453125, + "learning_rate": 1e-06, + "loss": 0.0267, + "num_tokens": 151958187.0, + "reward": 1.3544644117355347, + "reward_std": 0.139055073261261, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3544642925262451, + "rewards/curriculum_aware_reward_fn/std": 0.458530455827713, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1093.0, + "completions/max_terminated_length": 1093.0, + "completions/mean_length": 448.7500305175781, + "completions/mean_terminated_length": 448.7500305175781, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 1.3456280629352593, + "grad_norm": 0.6114272475242615, + "kl": 0.09912109375, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 152069830.0, + "reward": 1.3660715818405151, + "reward_std": 0.10887427628040314, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3660714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3791416585445404, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1100.0, + "completions/max_terminated_length": 1100.0, + "completions/mean_length": 453.52679443359375, + "completions/mean_terminated_length": 453.52679443359375, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 1.3466597884962601, + "grad_norm": 0.6221665143966675, + "kl": 0.108154296875, + "learning_rate": 1e-06, + "loss": -0.0138, + "num_tokens": 152193285.0, + "reward": 1.4495537281036377, + "reward_std": 0.13579247891902924, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44955357909202576, + "rewards/curriculum_aware_reward_fn/std": 0.3779684603214264, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2482.0, + "completions/max_terminated_length": 2482.0, + "completions/mean_length": 480.6875305175781, + "completions/mean_terminated_length": 480.6875305175781, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 1.3476915140572607, + "grad_norm": 0.5587440729141235, + "kl": 0.098876953125, + "learning_rate": 1e-06, + "loss": 0.0149, + "num_tokens": 152313748.0, + "reward": 1.4629465341567993, + "reward_std": 0.15617386996746063, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46294644474983215, + "rewards/curriculum_aware_reward_fn/std": 0.3960244953632355, + "step": 1306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1261.0, + "completions/mean_length": 517.6964721679688, + "completions/mean_terminated_length": 485.45947265625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 1.3487232396182616, + "grad_norm": 0.8069753050804138, + "kl": 0.1019287109375, + "learning_rate": 1e-06, + "loss": 0.0397, + "num_tokens": 152449374.0, + "reward": 1.364285945892334, + "reward_std": 0.24357934296131134, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.3821428716182709, + "rewards/curriculum_aware_reward_fn/std": 0.3882773816585541, + "step": 1307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 461.08929443359375, + "completions/mean_terminated_length": 461.08929443359375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 1.3497549651792622, + "grad_norm": 0.8312681317329407, + "kl": 0.0958251953125, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 152566480.0, + "reward": 1.3772321939468384, + "reward_std": 0.23976361751556396, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3772321343421936, + "rewards/curriculum_aware_reward_fn/std": 0.3572220504283905, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1349.0, + "completions/max_terminated_length": 1349.0, + "completions/mean_length": 493.3214416503906, + "completions/mean_terminated_length": 493.3214416503906, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 1.350786690740263, + "grad_norm": 0.600620448589325, + "kl": 0.102294921875, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 152701007.0, + "reward": 1.3272322416305542, + "reward_std": 0.1470688432455063, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.33616071939468384, + "rewards/curriculum_aware_reward_fn/std": 0.35235437750816345, + "step": 1309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1064.0, + "completions/max_terminated_length": 1064.0, + "completions/mean_length": 469.58038330078125, + "completions/mean_terminated_length": 469.58038330078125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 1.3518184163012639, + "grad_norm": 0.742341935634613, + "kl": 0.1064453125, + "learning_rate": 1e-06, + "loss": 0.0068, + "num_tokens": 152823208.0, + "reward": 1.5727680921554565, + "reward_std": 0.16378778219223022, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5727678537368774, + "rewards/curriculum_aware_reward_fn/std": 0.34300529956817627, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1022.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 480.3125305175781, + "completions/mean_terminated_length": 480.3125305175781, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 1.3528501418622647, + "grad_norm": 0.7505616545677185, + "kl": 0.106201171875, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 152942537.0, + "reward": 1.2919644117355347, + "reward_std": 0.14375154674053192, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2919642925262451, + "rewards/curriculum_aware_reward_fn/std": 0.3463161289691925, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1018.0, + "completions/max_terminated_length": 1018.0, + "completions/mean_length": 489.8482360839844, + "completions/mean_terminated_length": 489.8482360839844, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 1.3538818674232653, + "grad_norm": 0.7036014795303345, + "kl": 0.0926513671875, + "learning_rate": 1e-06, + "loss": 0.019, + "num_tokens": 153058076.0, + "reward": 1.3107143640518188, + "reward_std": 0.13862745463848114, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3107142746448517, + "rewards/curriculum_aware_reward_fn/std": 0.37677082419395447, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1097.0, + "completions/max_terminated_length": 1097.0, + "completions/mean_length": 491.419677734375, + "completions/mean_terminated_length": 491.419677734375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 1.3549135929842662, + "grad_norm": 0.6712673902511597, + "kl": 0.0941162109375, + "learning_rate": 1e-06, + "loss": 0.0055, + "num_tokens": 153177459.0, + "reward": 1.2339286804199219, + "reward_std": 0.15220916271209717, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.24285714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.34379029273986816, + "step": 1313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1090.0, + "completions/max_terminated_length": 1090.0, + "completions/mean_length": 460.3839416503906, + "completions/mean_terminated_length": 460.3839416503906, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 1.355945318545267, + "grad_norm": 0.7762075066566467, + "kl": 0.1126708984375, + "learning_rate": 1e-06, + "loss": -0.0224, + "num_tokens": 153296243.0, + "reward": 1.4513394832611084, + "reward_std": 0.17631617188453674, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4513393044471741, + "rewards/curriculum_aware_reward_fn/std": 0.38034242391586304, + "step": 1314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1333.0, + "completions/max_terminated_length": 1333.0, + "completions/mean_length": 469.1250305175781, + "completions/mean_terminated_length": 469.1250305175781, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 1.3569770441062676, + "grad_norm": 0.690093457698822, + "kl": 0.103515625, + "learning_rate": 1e-06, + "loss": -0.0279, + "num_tokens": 153412681.0, + "reward": 1.372321605682373, + "reward_std": 0.23327423632144928, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3812500536441803, + "rewards/curriculum_aware_reward_fn/std": 0.40935707092285156, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1092.0, + "completions/max_terminated_length": 1092.0, + "completions/mean_length": 470.5714416503906, + "completions/mean_terminated_length": 470.5714416503906, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 1.3580087696672685, + "grad_norm": 0.7823927998542786, + "kl": 0.101806640625, + "learning_rate": 1e-06, + "loss": 0.0137, + "num_tokens": 153528133.0, + "reward": 1.4848215579986572, + "reward_std": 0.2249300479888916, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.49375003576278687, + "rewards/curriculum_aware_reward_fn/std": 0.44782379269599915, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1485.0, + "completions/max_terminated_length": 1485.0, + "completions/mean_length": 515.232177734375, + "completions/mean_terminated_length": 515.232177734375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 1.3590404952282693, + "grad_norm": 0.5873052477836609, + "kl": 0.0999755859375, + "learning_rate": 1e-06, + "loss": -0.014, + "num_tokens": 153646361.0, + "reward": 1.317857265472412, + "reward_std": 0.11958328634500504, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.31785717606544495, + "rewards/curriculum_aware_reward_fn/std": 0.43799594044685364, + "step": 1317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1250.0, + "completions/max_terminated_length": 1250.0, + "completions/mean_length": 527.4375, + "completions/mean_terminated_length": 527.4375, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 1.3600722207892701, + "grad_norm": 2.275315046310425, + "kl": 0.24658203125, + "learning_rate": 1e-06, + "loss": 0.0292, + "num_tokens": 153782609.0, + "reward": 1.2450894117355347, + "reward_std": 0.20867618918418884, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.2540178894996643, + "rewards/curriculum_aware_reward_fn/std": 0.322304368019104, + "step": 1318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1005.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 478.482177734375, + "completions/mean_terminated_length": 478.482177734375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 1.3611039463502708, + "grad_norm": 0.6259239315986633, + "kl": 0.09228515625, + "learning_rate": 1e-06, + "loss": 0.0235, + "num_tokens": 153895538.0, + "reward": 1.4031251668930054, + "reward_std": 0.17404134571552277, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.41205358505249023, + "rewards/curriculum_aware_reward_fn/std": 0.3928554654121399, + "step": 1319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1701.0, + "completions/max_terminated_length": 1701.0, + "completions/mean_length": 481.4107360839844, + "completions/mean_terminated_length": 481.4107360839844, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 1.3621356719112716, + "grad_norm": 0.7713581919670105, + "kl": 0.10888671875, + "learning_rate": 1e-06, + "loss": 0.0174, + "num_tokens": 154021374.0, + "reward": 1.4316965341567993, + "reward_std": 0.17943032085895538, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43169644474983215, + "rewards/curriculum_aware_reward_fn/std": 0.3492588996887207, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1068.0, + "completions/max_terminated_length": 1068.0, + "completions/mean_length": 429.2589416503906, + "completions/mean_terminated_length": 429.2589416503906, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 1.3631673974722724, + "grad_norm": 0.777759850025177, + "kl": 0.1014404296875, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 154128706.0, + "reward": 1.512946605682373, + "reward_std": 0.2099122405052185, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5218750238418579, + "rewards/curriculum_aware_reward_fn/std": 0.38699474930763245, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3884.0, + "completions/max_terminated_length": 3884.0, + "completions/mean_length": 487.7410888671875, + "completions/mean_terminated_length": 487.7410888671875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 1.364199123033273, + "grad_norm": 0.6767941117286682, + "kl": 0.09033203125, + "learning_rate": 1e-06, + "loss": 0.0382, + "num_tokens": 154249462.0, + "reward": 1.4066966772079468, + "reward_std": 0.177678644657135, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.43348217010498047, + "rewards/curriculum_aware_reward_fn/std": 0.3961747884750366, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 474.1160888671875, + "completions/mean_terminated_length": 474.1160888671875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 1.365230848594274, + "grad_norm": 0.7261276841163635, + "kl": 0.1041259765625, + "learning_rate": 1e-06, + "loss": 0.0182, + "num_tokens": 154372535.0, + "reward": 1.375892996788025, + "reward_std": 0.20264343917369843, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.40267854928970337, + "rewards/curriculum_aware_reward_fn/std": 0.35220280289649963, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 926.0, + "completions/max_terminated_length": 926.0, + "completions/mean_length": 462.6250305175781, + "completions/mean_terminated_length": 462.6250305175781, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 1.3662625741552747, + "grad_norm": 0.7062304019927979, + "kl": 0.1004638671875, + "learning_rate": 1e-06, + "loss": 0.0282, + "num_tokens": 154494279.0, + "reward": 1.3062500953674316, + "reward_std": 0.1957845240831375, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.33303573727607727, + "rewards/curriculum_aware_reward_fn/std": 0.37160196900367737, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1075.0, + "completions/max_terminated_length": 1075.0, + "completions/mean_length": 476.1964416503906, + "completions/mean_terminated_length": 476.1964416503906, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 1.3672942997162756, + "grad_norm": 0.7053699493408203, + "kl": 0.1025390625, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 154612811.0, + "reward": 1.2950894832611084, + "reward_std": 0.1937311738729477, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.32187503576278687, + "rewards/curriculum_aware_reward_fn/std": 0.3728863000869751, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 938.0, + "completions/max_terminated_length": 938.0, + "completions/mean_length": 430.9464416503906, + "completions/mean_terminated_length": 430.9464416503906, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 1.3683260252772762, + "grad_norm": 0.7252833843231201, + "kl": 0.1053466796875, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 154728347.0, + "reward": 1.4075894355773926, + "reward_std": 0.22101444005966187, + "rewards/code_format_reward/mean": 0.9553571343421936, + "rewards/code_format_reward/std": 0.2074466347694397, + "rewards/curriculum_aware_reward_fn/mean": 0.45223215222358704, + "rewards/curriculum_aware_reward_fn/std": 0.37820035219192505, + "step": 1326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1116.0, + "completions/max_terminated_length": 1116.0, + "completions/mean_length": 480.58929443359375, + "completions/mean_terminated_length": 480.58929443359375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 1.369357750838277, + "grad_norm": 0.7784614562988281, + "kl": 0.095703125, + "learning_rate": 1e-06, + "loss": 0.051, + "num_tokens": 154848064.0, + "reward": 1.3196429014205933, + "reward_std": 0.24868425726890564, + "rewards/code_format_reward/mean": 0.9642857313156128, + "rewards/code_format_reward/std": 0.18641091883182526, + "rewards/curriculum_aware_reward_fn/mean": 0.3553571403026581, + "rewards/curriculum_aware_reward_fn/std": 0.3615131974220276, + "step": 1327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1087.0, + "completions/max_terminated_length": 1087.0, + "completions/mean_length": 450.1875305175781, + "completions/mean_terminated_length": 450.1875305175781, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 1.3703894763992777, + "grad_norm": 0.6946715116500854, + "kl": 0.09619140625, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 154974022.0, + "reward": 1.3125, + "reward_std": 0.17005577683448792, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.3303571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.3713475167751312, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 422.77679443359375, + "completions/mean_terminated_length": 422.77679443359375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 1.3714212019602785, + "grad_norm": 0.8444381356239319, + "kl": 0.0989990234375, + "learning_rate": 1e-06, + "loss": -0.003, + "num_tokens": 155080432.0, + "reward": 1.4718753099441528, + "reward_std": 0.23361340165138245, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47187498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3599788248538971, + "step": 1329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1056.0, + "completions/max_terminated_length": 1056.0, + "completions/mean_length": 485.7589416503906, + "completions/mean_terminated_length": 485.7589416503906, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 1.3724529275212793, + "grad_norm": 0.7258499264717102, + "kl": 0.100830078125, + "learning_rate": 1e-06, + "loss": 0.0084, + "num_tokens": 155199545.0, + "reward": 1.2245535850524902, + "reward_std": 0.2251826673746109, + "rewards/code_format_reward/mean": 0.9464285969734192, + "rewards/code_format_reward/std": 0.2261819988489151, + "rewards/curriculum_aware_reward_fn/mean": 0.27812498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.3506609797477722, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 422.02679443359375, + "completions/mean_terminated_length": 422.02679443359375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 1.3734846530822802, + "grad_norm": 0.8235486745834351, + "kl": 0.093994140625, + "learning_rate": 1e-06, + "loss": 0.0185, + "num_tokens": 155307754.0, + "reward": 1.4330357313156128, + "reward_std": 0.22114646434783936, + "rewards/code_format_reward/mean": 0.9642857313156128, + "rewards/code_format_reward/std": 0.18641091883182526, + "rewards/curriculum_aware_reward_fn/mean": 0.46875, + "rewards/curriculum_aware_reward_fn/std": 0.3736914098262787, + "step": 1331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1188.0, + "completions/max_terminated_length": 1188.0, + "completions/mean_length": 413.3482360839844, + "completions/mean_terminated_length": 413.3482360839844, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 1.374516378643281, + "grad_norm": 0.7121307253837585, + "kl": 0.0933837890625, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 155420871.0, + "reward": 1.364732265472412, + "reward_std": 0.15526407957077026, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.38258931040763855, + "rewards/curriculum_aware_reward_fn/std": 0.3583439290523529, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1161.0, + "completions/max_terminated_length": 1161.0, + "completions/mean_length": 467.0982360839844, + "completions/mean_terminated_length": 467.0982360839844, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 1.3755481042042816, + "grad_norm": 0.756607711315155, + "kl": 0.0885009765625, + "learning_rate": 1e-06, + "loss": 0.0375, + "num_tokens": 155544243.0, + "reward": 1.3258929252624512, + "reward_std": 0.21575960516929626, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.3526785671710968, + "rewards/curriculum_aware_reward_fn/std": 0.37595757842063904, + "step": 1333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1019.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 451.20538330078125, + "completions/mean_terminated_length": 451.20538330078125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 1.3765798297652825, + "grad_norm": 0.8574563264846802, + "kl": 0.0914306640625, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 155663308.0, + "reward": 1.3325893878936768, + "reward_std": 0.1875346153974533, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3415178656578064, + "rewards/curriculum_aware_reward_fn/std": 0.36291417479515076, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 952.0, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 474.2857360839844, + "completions/mean_terminated_length": 474.2857360839844, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 1.377611555326283, + "grad_norm": 0.6797184348106384, + "kl": 0.0853271484375, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 155783103.0, + "reward": 1.41785728931427, + "reward_std": 0.1773865520954132, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.42678573727607727, + "rewards/curriculum_aware_reward_fn/std": 0.37034979462623596, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 906.0, + "completions/max_terminated_length": 906.0, + "completions/mean_length": 488.8660888671875, + "completions/mean_terminated_length": 488.8660888671875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 1.378643280887284, + "grad_norm": 0.7231094241142273, + "kl": 0.08642578125, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 155905203.0, + "reward": 1.2906250953674316, + "reward_std": 0.19704152643680573, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.2995535731315613, + "rewards/curriculum_aware_reward_fn/std": 0.37062740325927734, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1065.0, + "completions/max_terminated_length": 1065.0, + "completions/mean_length": 455.8482360839844, + "completions/mean_terminated_length": 455.8482360839844, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 1.3796750064482848, + "grad_norm": 0.7511847019195557, + "kl": 0.087646484375, + "learning_rate": 1e-06, + "loss": -0.0121, + "num_tokens": 156016675.0, + "reward": 1.3816964626312256, + "reward_std": 0.12826259434223175, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3816964328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3298235833644867, + "step": 1337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 434.6160888671875, + "completions/mean_terminated_length": 434.6160888671875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 1.3807067320092856, + "grad_norm": 0.8543606996536255, + "kl": 0.099609375, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 156130151.0, + "reward": 1.4160715341567993, + "reward_std": 0.21647287905216217, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41607144474983215, + "rewards/curriculum_aware_reward_fn/std": 0.3719448447227478, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 919.0, + "completions/max_terminated_length": 919.0, + "completions/mean_length": 414.26788330078125, + "completions/mean_terminated_length": 414.26788330078125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 1.3817384575702862, + "grad_norm": 0.6648277640342712, + "kl": 0.1080322265625, + "learning_rate": 1e-06, + "loss": 0.0143, + "num_tokens": 156242916.0, + "reward": 1.3767858743667603, + "reward_std": 0.13667474687099457, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3857143223285675, + "rewards/curriculum_aware_reward_fn/std": 0.3733479380607605, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 939.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 477.77679443359375, + "completions/mean_terminated_length": 477.77679443359375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 1.382770183131287, + "grad_norm": 0.825429379940033, + "kl": 0.1053466796875, + "learning_rate": 1e-06, + "loss": 0.0274, + "num_tokens": 156362318.0, + "reward": 1.3816965818405151, + "reward_std": 0.21259619295597076, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3816964328289032, + "rewards/curriculum_aware_reward_fn/std": 0.35336196422576904, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 447.26788330078125, + "completions/mean_terminated_length": 447.26788330078125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 1.383801908692288, + "grad_norm": 0.601068913936615, + "kl": 0.0947265625, + "learning_rate": 1e-06, + "loss": -0.0024, + "num_tokens": 156476644.0, + "reward": 1.2200894355773926, + "reward_std": 0.0771571695804596, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22008930146694183, + "rewards/curriculum_aware_reward_fn/std": 0.2974713146686554, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1030.0, + "completions/max_terminated_length": 1030.0, + "completions/mean_length": 388.5357360839844, + "completions/mean_terminated_length": 388.5357360839844, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 1.3848336342532885, + "grad_norm": 0.8562108278274536, + "kl": 0.10888671875, + "learning_rate": 1e-06, + "loss": 0.0165, + "num_tokens": 156588501.0, + "reward": 1.4767860174179077, + "reward_std": 0.20885738730430603, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47678571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3956366181373596, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 389.3125305175781, + "completions/mean_terminated_length": 389.3125305175781, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 1.3858653598142894, + "grad_norm": 0.7280513048171997, + "kl": 0.1011962890625, + "learning_rate": 1e-06, + "loss": -0.0069, + "num_tokens": 156687411.0, + "reward": 1.4861608743667603, + "reward_std": 0.10716202110052109, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4861607551574707, + "rewards/curriculum_aware_reward_fn/std": 0.35956743359565735, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 929.0, + "completions/max_terminated_length": 929.0, + "completions/mean_length": 374.2500305175781, + "completions/mean_terminated_length": 374.2500305175781, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 1.3868970853752902, + "grad_norm": 0.6822706460952759, + "kl": 0.0941162109375, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 156793832.0, + "reward": 1.3946430683135986, + "reward_std": 0.10583854466676712, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3946428894996643, + "rewards/curriculum_aware_reward_fn/std": 0.3630673289299011, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 436.58929443359375, + "completions/mean_terminated_length": 436.58929443359375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 1.387928810936291, + "grad_norm": 0.7481700778007507, + "kl": 0.0899658203125, + "learning_rate": 1e-06, + "loss": -0.0189, + "num_tokens": 156897414.0, + "reward": 1.3848215341567993, + "reward_std": 0.13752786815166473, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.39375001192092896, + "rewards/curriculum_aware_reward_fn/std": 0.35780394077301025, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1382.0, + "completions/max_terminated_length": 1382.0, + "completions/mean_length": 446.3482360839844, + "completions/mean_terminated_length": 446.3482360839844, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 1.3889605364972917, + "grad_norm": 0.6150304675102234, + "kl": 0.097412109375, + "learning_rate": 1e-06, + "loss": -0.0227, + "num_tokens": 157013835.0, + "reward": 1.3187501430511475, + "reward_std": 0.09968876093626022, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3187500536441803, + "rewards/curriculum_aware_reward_fn/std": 0.3391912281513214, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1066.0, + "completions/max_terminated_length": 1066.0, + "completions/mean_length": 403.6875305175781, + "completions/mean_terminated_length": 403.6875305175781, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 1.3899922620582925, + "grad_norm": 0.7923887968063354, + "kl": 0.1036376953125, + "learning_rate": 1e-06, + "loss": -0.0086, + "num_tokens": 157126682.0, + "reward": 1.3732144832611084, + "reward_std": 0.19832755625247955, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3821428418159485, + "rewards/curriculum_aware_reward_fn/std": 0.36698535084724426, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1755.0, + "completions/max_terminated_length": 1755.0, + "completions/mean_length": 398.4821472167969, + "completions/mean_terminated_length": 398.4821472167969, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 1.3910239876192931, + "grad_norm": 0.6704386472702026, + "kl": 0.094482421875, + "learning_rate": 1e-06, + "loss": 0.0231, + "num_tokens": 157235243.0, + "reward": 1.4232144355773926, + "reward_std": 0.13495533168315887, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42321428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3870033025741577, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 407.6250305175781, + "completions/mean_terminated_length": 407.6250305175781, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 1.392055713180294, + "grad_norm": 0.9023550152778625, + "kl": 0.106689453125, + "learning_rate": 1e-06, + "loss": 0.0333, + "num_tokens": 157347544.0, + "reward": 1.4281251430511475, + "reward_std": 0.22871477901935577, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4370536208152771, + "rewards/curriculum_aware_reward_fn/std": 0.3764907717704773, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 392.39288330078125, + "completions/mean_terminated_length": 392.39288330078125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 1.3930874387412948, + "grad_norm": 0.9493203163146973, + "kl": 0.1072998046875, + "learning_rate": 1e-06, + "loss": 0.0169, + "num_tokens": 157461688.0, + "reward": 1.2642858028411865, + "reward_std": 0.24542173743247986, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.26428571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.38463911414146423, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 977.0, + "completions/max_terminated_length": 977.0, + "completions/mean_length": 435.6696472167969, + "completions/mean_terminated_length": 435.6696472167969, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 1.3941191643022957, + "grad_norm": 0.7578283548355103, + "kl": 0.1094970703125, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 157583220.0, + "reward": 1.3258929252624512, + "reward_std": 0.1972636580467224, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3258928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.34731805324554443, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 434.26788330078125, + "completions/mean_terminated_length": 434.26788330078125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 1.3951508898632965, + "grad_norm": 0.8210800290107727, + "kl": 0.095947265625, + "learning_rate": 1e-06, + "loss": 0.0344, + "num_tokens": 157699639.0, + "reward": 1.305803656578064, + "reward_std": 0.2395620495080948, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.3236607015132904, + "rewards/curriculum_aware_reward_fn/std": 0.37548527121543884, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 416.3750305175781, + "completions/mean_terminated_length": 416.3750305175781, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 1.3961826154242971, + "grad_norm": 0.9273058772087097, + "kl": 0.1324462890625, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 157818737.0, + "reward": 1.391517996788025, + "reward_std": 0.1689910590648651, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39151787757873535, + "rewards/curriculum_aware_reward_fn/std": 0.373073935508728, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 407.8750305175781, + "completions/mean_terminated_length": 407.8750305175781, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 1.397214340985298, + "grad_norm": 0.7501819729804993, + "kl": 0.113525390625, + "learning_rate": 1e-06, + "loss": 0.0045, + "num_tokens": 157928371.0, + "reward": 1.3727679252624512, + "reward_std": 0.12758342921733856, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3727678954601288, + "rewards/curriculum_aware_reward_fn/std": 0.3793005049228668, + "step": 1354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 807.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 402.8571472167969, + "completions/mean_terminated_length": 402.8571472167969, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 1.3982460665462986, + "grad_norm": 0.6688710451126099, + "kl": 0.0921630859375, + "learning_rate": 1e-06, + "loss": 0.0106, + "num_tokens": 158044940.0, + "reward": 1.4031250476837158, + "reward_std": 0.13010619580745697, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40312501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.379976361989975, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 431.5446472167969, + "completions/mean_terminated_length": 431.5446472167969, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 1.3992777921072994, + "grad_norm": 0.6437855362892151, + "kl": 0.0950927734375, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 158162041.0, + "reward": 1.3906251192092896, + "reward_std": 0.09287998080253601, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.390625, + "rewards/curriculum_aware_reward_fn/std": 0.343506783246994, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 401.14288330078125, + "completions/mean_terminated_length": 401.14288330078125, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 1.4003095176683003, + "grad_norm": 0.7588181495666504, + "kl": 0.101806640625, + "learning_rate": 1e-06, + "loss": -0.0142, + "num_tokens": 158274603.0, + "reward": 1.3504464626312256, + "reward_std": 0.17711101472377777, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3504464328289032, + "rewards/curriculum_aware_reward_fn/std": 0.46820029616355896, + "step": 1357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 709.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 370.2857360839844, + "completions/mean_terminated_length": 370.2857360839844, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 1.401341243229301, + "grad_norm": 0.6538251042366028, + "kl": 0.097900390625, + "learning_rate": 1e-06, + "loss": 0.0165, + "num_tokens": 158379432.0, + "reward": 1.4915181398391724, + "reward_std": 0.13546860218048096, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4915178716182709, + "rewards/curriculum_aware_reward_fn/std": 0.40757957100868225, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 996.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 385.46429443359375, + "completions/mean_terminated_length": 385.46429443359375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 1.4023729687903017, + "grad_norm": 0.7940965890884399, + "kl": 0.09423828125, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 158490742.0, + "reward": 1.4316965341567993, + "reward_std": 0.18486927449703217, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.44062504172325134, + "rewards/curriculum_aware_reward_fn/std": 0.34987330436706543, + "step": 1359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 748.0, + "completions/max_terminated_length": 748.0, + "completions/mean_length": 388.4196472167969, + "completions/mean_terminated_length": 388.4196472167969, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 1.4034046943513026, + "grad_norm": 0.7965719103813171, + "kl": 0.097412109375, + "learning_rate": 1e-06, + "loss": 0.0106, + "num_tokens": 158591554.0, + "reward": 1.341071605682373, + "reward_std": 0.14536845684051514, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3410714268684387, + "rewards/curriculum_aware_reward_fn/std": 0.3188672363758087, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 412.6607360839844, + "completions/mean_terminated_length": 412.6607360839844, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 1.4044364199123034, + "grad_norm": 0.7682105302810669, + "kl": 0.0946044921875, + "learning_rate": 1e-06, + "loss": 0.0178, + "num_tokens": 158705056.0, + "reward": 1.4508929252624512, + "reward_std": 0.18029843270778656, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4508928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3541887402534485, + "step": 1361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 866.0, + "completions/max_terminated_length": 866.0, + "completions/mean_length": 387.6875305175781, + "completions/mean_terminated_length": 387.6875305175781, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 1.405468145473304, + "grad_norm": 0.7575867772102356, + "kl": 0.0928955078125, + "learning_rate": 1e-06, + "loss": -0.0044, + "num_tokens": 158816097.0, + "reward": 1.4593751430511475, + "reward_std": 0.1781318336725235, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4593750536441803, + "rewards/curriculum_aware_reward_fn/std": 0.3494223952293396, + "step": 1362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 353.4732360839844, + "completions/mean_terminated_length": 353.4732360839844, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 1.4064998710343048, + "grad_norm": 0.8406264185905457, + "kl": 0.107421875, + "learning_rate": 1e-06, + "loss": 0.0123, + "num_tokens": 158921983.0, + "reward": 1.5031250715255737, + "reward_std": 0.1740131974220276, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.503125011920929, + "rewards/curriculum_aware_reward_fn/std": 0.3575619161128998, + "step": 1363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 851.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 404.2857360839844, + "completions/mean_terminated_length": 404.2857360839844, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 1.4075315965953057, + "grad_norm": 0.8518685102462769, + "kl": 0.095947265625, + "learning_rate": 1e-06, + "loss": -0.0241, + "num_tokens": 159041176.0, + "reward": 1.344642996788025, + "reward_std": 0.21171006560325623, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34464284777641296, + "rewards/curriculum_aware_reward_fn/std": 0.3280681073665619, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1481.0, + "completions/max_terminated_length": 1481.0, + "completions/mean_length": 394.3125305175781, + "completions/mean_terminated_length": 394.3125305175781, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 1.4085633221563065, + "grad_norm": 0.7855452299118042, + "kl": 0.0970458984375, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 159145959.0, + "reward": 1.4522322416305542, + "reward_std": 0.13949735462665558, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45223215222358704, + "rewards/curriculum_aware_reward_fn/std": 0.3876696527004242, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 366.3125305175781, + "completions/mean_terminated_length": 366.3125305175781, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 1.4095950477173071, + "grad_norm": 0.9100239872932434, + "kl": 0.104736328125, + "learning_rate": 1e-06, + "loss": -0.0092, + "num_tokens": 159255796.0, + "reward": 1.4200893640518188, + "reward_std": 0.18777905404567719, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4200893044471741, + "rewards/curriculum_aware_reward_fn/std": 0.34520265460014343, + "step": 1366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 377.65179443359375, + "completions/mean_terminated_length": 377.65179443359375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 1.410626773278308, + "grad_norm": 0.708103597164154, + "kl": 0.0955810546875, + "learning_rate": 1e-06, + "loss": 0.0245, + "num_tokens": 159361478.0, + "reward": 1.5017858743667603, + "reward_std": 0.17366045713424683, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5017856955528259, + "rewards/curriculum_aware_reward_fn/std": 0.3746426999568939, + "step": 1367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1215.0, + "completions/max_terminated_length": 1215.0, + "completions/mean_length": 455.45538330078125, + "completions/mean_terminated_length": 455.45538330078125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 1.4116584988393086, + "grad_norm": 0.5828267931938171, + "kl": 0.0880126953125, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 159481446.0, + "reward": 1.2611607313156128, + "reward_std": 0.0732208862900734, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2611607313156128, + "rewards/curriculum_aware_reward_fn/std": 0.3630892336368561, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 892.0, + "completions/max_terminated_length": 892.0, + "completions/mean_length": 419.1875305175781, + "completions/mean_terminated_length": 419.1875305175781, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 1.4126902244003094, + "grad_norm": 0.7837874889373779, + "kl": 0.089599609375, + "learning_rate": 1e-06, + "loss": -0.0019, + "num_tokens": 159592760.0, + "reward": 1.344642996788025, + "reward_std": 0.20070511102676392, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34464284777641296, + "rewards/curriculum_aware_reward_fn/std": 0.3375425934791565, + "step": 1369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 874.0, + "completions/max_terminated_length": 874.0, + "completions/mean_length": 392.95538330078125, + "completions/mean_terminated_length": 392.95538330078125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 1.4137219499613103, + "grad_norm": 0.798795759677887, + "kl": 0.0869140625, + "learning_rate": 1e-06, + "loss": 0.0139, + "num_tokens": 159702879.0, + "reward": 1.3674108982086182, + "reward_std": 0.1692216545343399, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36741071939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3445005714893341, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 894.0, + "completions/max_terminated_length": 894.0, + "completions/mean_length": 386.5714416503906, + "completions/mean_terminated_length": 386.5714416503906, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 1.4147536755223111, + "grad_norm": 0.8209942579269409, + "kl": 0.1168212890625, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 159817316.0, + "reward": 1.5406252145767212, + "reward_std": 0.20728908479213715, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5406250357627869, + "rewards/curriculum_aware_reward_fn/std": 0.35523951053619385, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1228.0, + "completions/max_terminated_length": 1228.0, + "completions/mean_length": 438.1160888671875, + "completions/mean_terminated_length": 438.1160888671875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 1.415785401083312, + "grad_norm": 0.7205373048782349, + "kl": 0.0789794921875, + "learning_rate": 1e-06, + "loss": 0.0351, + "num_tokens": 159934901.0, + "reward": 1.4044643640518188, + "reward_std": 0.14802031219005585, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4044643044471741, + "rewards/curriculum_aware_reward_fn/std": 0.3880857229232788, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 469.4285888671875, + "completions/mean_terminated_length": 469.4285888671875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 1.4168171266443126, + "grad_norm": 0.6454539895057678, + "kl": 0.0826416015625, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 160056315.0, + "reward": 1.274553656578064, + "reward_std": 0.1375679075717926, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2745535969734192, + "rewards/curriculum_aware_reward_fn/std": 0.3422892987728119, + "step": 1373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 410.90179443359375, + "completions/mean_terminated_length": 410.90179443359375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 1.4178488522053134, + "grad_norm": 0.717077910900116, + "kl": 0.09716796875, + "learning_rate": 1e-06, + "loss": -0.014, + "num_tokens": 160174940.0, + "reward": 1.2781251668930054, + "reward_std": 0.1405002921819687, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.27812501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3249934911727905, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 376.40179443359375, + "completions/mean_terminated_length": 376.40179443359375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 1.418880577766314, + "grad_norm": 0.691324770450592, + "kl": 0.0902099609375, + "learning_rate": 1e-06, + "loss": -0.0082, + "num_tokens": 160274924.0, + "reward": 1.5562502145767212, + "reward_std": 0.1841638833284378, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5562499761581421, + "rewards/curriculum_aware_reward_fn/std": 0.3889883756637573, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1306.0, + "completions/max_terminated_length": 1306.0, + "completions/mean_length": 423.4464416503906, + "completions/mean_terminated_length": 423.4464416503906, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 1.4199123033273149, + "grad_norm": 0.7552855014801025, + "kl": 0.091064453125, + "learning_rate": 1e-06, + "loss": -0.0103, + "num_tokens": 160389459.0, + "reward": 1.520535945892334, + "reward_std": 0.20544345676898956, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5294643044471741, + "rewards/curriculum_aware_reward_fn/std": 0.42688655853271484, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 408.2589416503906, + "completions/mean_terminated_length": 408.2589416503906, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 1.4209440288883157, + "grad_norm": 0.7593014240264893, + "kl": 0.1038818359375, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 160513997.0, + "reward": 1.4910715818405151, + "reward_std": 0.18334133923053741, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5, + "rewards/curriculum_aware_reward_fn/std": 0.3415650427341461, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 988.0, + "completions/max_terminated_length": 988.0, + "completions/mean_length": 415.4732360839844, + "completions/mean_terminated_length": 415.4732360839844, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 1.4219757544493166, + "grad_norm": 0.7567901611328125, + "kl": 0.09130859375, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 160629711.0, + "reward": 1.3964285850524902, + "reward_std": 0.17067018151283264, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39642858505249023, + "rewards/curriculum_aware_reward_fn/std": 0.38471439480781555, + "step": 1378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1413.0, + "completions/max_terminated_length": 1413.0, + "completions/mean_length": 431.8660888671875, + "completions/mean_terminated_length": 431.8660888671875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 1.4230074800103172, + "grad_norm": 0.7414162755012512, + "kl": 0.0986328125, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 160742557.0, + "reward": 1.4924108982086182, + "reward_std": 0.20008407533168793, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.501339316368103, + "rewards/curriculum_aware_reward_fn/std": 0.3692249059677124, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1042.0, + "completions/max_terminated_length": 1042.0, + "completions/mean_length": 417.33929443359375, + "completions/mean_terminated_length": 417.33929443359375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 1.424039205571318, + "grad_norm": 0.6366227269172668, + "kl": 0.1085205078125, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 160861168.0, + "reward": 1.3656251430511475, + "reward_std": 0.1291547268629074, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3656250536441803, + "rewards/curriculum_aware_reward_fn/std": 0.37990227341651917, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1041.0, + "completions/max_terminated_length": 1041.0, + "completions/mean_length": 429.7589416503906, + "completions/mean_terminated_length": 429.7589416503906, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 1.4250709311323189, + "grad_norm": 0.6410421133041382, + "kl": 0.08984375, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 160983718.0, + "reward": 1.6232143640518188, + "reward_std": 0.15141195058822632, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6232143044471741, + "rewards/curriculum_aware_reward_fn/std": 0.44402584433555603, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 969.0, + "completions/max_terminated_length": 969.0, + "completions/mean_length": 452.4375305175781, + "completions/mean_terminated_length": 452.4375305175781, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 1.4261026566933195, + "grad_norm": 0.7187079191207886, + "kl": 0.095703125, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 161103110.0, + "reward": 1.3169643878936768, + "reward_std": 0.1875334084033966, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3169642984867096, + "rewards/curriculum_aware_reward_fn/std": 0.3558743894100189, + "step": 1382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1380.0, + "completions/max_terminated_length": 1380.0, + "completions/mean_length": 431.7857360839844, + "completions/mean_terminated_length": 431.7857360839844, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 1.4271343822543203, + "grad_norm": 0.7007352113723755, + "kl": 0.0953369140625, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 161225565.0, + "reward": 1.3937500715255737, + "reward_std": 0.17483645677566528, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39374998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3934214115142822, + "step": 1383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1214.0, + "completions/max_terminated_length": 1214.0, + "completions/mean_length": 397.39288330078125, + "completions/mean_terminated_length": 397.39288330078125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 1.4281661078153212, + "grad_norm": 0.7081319093704224, + "kl": 0.1026611328125, + "learning_rate": 1e-06, + "loss": 0.0092, + "num_tokens": 161332072.0, + "reward": 1.4406250715255737, + "reward_std": 0.1809663474559784, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44062501192092896, + "rewards/curriculum_aware_reward_fn/std": 0.36702674627304077, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1568.0, + "completions/max_terminated_length": 1568.0, + "completions/mean_length": 461.1785888671875, + "completions/mean_terminated_length": 461.1785888671875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 1.429197833376322, + "grad_norm": 0.6438872218132019, + "kl": 0.089599609375, + "learning_rate": 1e-06, + "loss": 0.0123, + "num_tokens": 161447200.0, + "reward": 1.4156250953674316, + "reward_std": 0.21118290722370148, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41562503576278687, + "rewards/curriculum_aware_reward_fn/std": 0.3949587643146515, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1009.0, + "completions/max_terminated_length": 1009.0, + "completions/mean_length": 438.0000305175781, + "completions/mean_terminated_length": 438.0000305175781, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 1.4302295589373226, + "grad_norm": 0.6846404671669006, + "kl": 0.092529296875, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 161560831.0, + "reward": 1.426785945892334, + "reward_std": 0.23958255350589752, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.45357146859169006, + "rewards/curriculum_aware_reward_fn/std": 0.3801797926425934, + "step": 1386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1300.0, + "completions/max_terminated_length": 1300.0, + "completions/mean_length": 520.7142944335938, + "completions/mean_terminated_length": 520.7142944335938, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 1.4312612844983235, + "grad_norm": 0.7028166055679321, + "kl": 0.0889892578125, + "learning_rate": 1e-06, + "loss": -0.01, + "num_tokens": 161683208.0, + "reward": 1.3093751668930054, + "reward_std": 0.16449230909347534, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.33616071939468384, + "rewards/curriculum_aware_reward_fn/std": 0.35235437750816345, + "step": 1387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 384.2232360839844, + "completions/mean_terminated_length": 384.2232360839844, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 1.4322930100593243, + "grad_norm": 0.6343652009963989, + "kl": 0.0977783203125, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 161788184.0, + "reward": 1.3741072416305542, + "reward_std": 0.13007023930549622, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37410715222358704, + "rewards/curriculum_aware_reward_fn/std": 0.40089890360832214, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 891.0, + "completions/max_terminated_length": 891.0, + "completions/mean_length": 403.0000305175781, + "completions/mean_terminated_length": 403.0000305175781, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 1.433324735620325, + "grad_norm": 0.7477560043334961, + "kl": 0.0970458984375, + "learning_rate": 1e-06, + "loss": 0.0181, + "num_tokens": 161900632.0, + "reward": 1.3531250953674316, + "reward_std": 0.20165258646011353, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.36205360293388367, + "rewards/curriculum_aware_reward_fn/std": 0.37807273864746094, + "step": 1389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 891.0, + "completions/max_terminated_length": 891.0, + "completions/mean_length": 436.08929443359375, + "completions/mean_terminated_length": 436.08929443359375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 1.4343564611813258, + "grad_norm": 0.6067072749137878, + "kl": 0.0994873046875, + "learning_rate": 1e-06, + "loss": 0.0227, + "num_tokens": 162016317.0, + "reward": 1.2183036804199219, + "reward_std": 0.20316217839717865, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.22723214328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3544370234012604, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3818.0, + "completions/max_terminated_length": 3818.0, + "completions/mean_length": 489.7500305175781, + "completions/mean_terminated_length": 489.7500305175781, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 1.4353881867423266, + "grad_norm": 0.679914116859436, + "kl": 0.0909423828125, + "learning_rate": 1e-06, + "loss": -0.0185, + "num_tokens": 162136227.0, + "reward": 1.3629463911056519, + "reward_std": 0.2041923701763153, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36294645071029663, + "rewards/curriculum_aware_reward_fn/std": 0.3852427303791046, + "step": 1391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 948.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 435.76788330078125, + "completions/mean_terminated_length": 435.76788330078125, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 1.4364199123033274, + "grad_norm": 0.7845681309700012, + "kl": 0.09423828125, + "learning_rate": 1e-06, + "loss": -0.0101, + "num_tokens": 162252772.0, + "reward": 1.2741073369979858, + "reward_std": 0.20398284494876862, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.2830357253551483, + "rewards/curriculum_aware_reward_fn/std": 0.3526957631111145, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 938.0, + "completions/max_terminated_length": 938.0, + "completions/mean_length": 471.3125305175781, + "completions/mean_terminated_length": 471.3125305175781, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 1.437451637864328, + "grad_norm": 0.6001479029655457, + "kl": 0.086669921875, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 162384463.0, + "reward": 1.3040179014205933, + "reward_std": 0.153373122215271, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3040178418159485, + "rewards/curriculum_aware_reward_fn/std": 0.3661424517631531, + "step": 1393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.0, + "completions/max_terminated_length": 792.0, + "completions/mean_length": 421.71429443359375, + "completions/mean_terminated_length": 421.71429443359375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 1.438483363425329, + "grad_norm": 0.6689612865447998, + "kl": 0.10498046875, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 162498069.0, + "reward": 1.424553632736206, + "reward_std": 0.1970020830631256, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42455360293388367, + "rewards/curriculum_aware_reward_fn/std": 0.402972936630249, + "step": 1394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1096.0, + "completions/max_terminated_length": 1096.0, + "completions/mean_length": 452.1250305175781, + "completions/mean_terminated_length": 452.1250305175781, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 1.4395150889863295, + "grad_norm": 0.7905884385108948, + "kl": 0.09716796875, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 162618476.0, + "reward": 1.2879464626312256, + "reward_std": 0.248353973031044, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.296875, + "rewards/curriculum_aware_reward_fn/std": 0.3422423005104065, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1315.0, + "completions/max_terminated_length": 1315.0, + "completions/mean_length": 457.4107360839844, + "completions/mean_terminated_length": 457.4107360839844, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 1.4405468145473304, + "grad_norm": 0.7530561685562134, + "kl": 0.099853515625, + "learning_rate": 1e-06, + "loss": -0.0038, + "num_tokens": 162738894.0, + "reward": 1.3598215579986572, + "reward_std": 0.1974332481622696, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35982146859169006, + "rewards/curriculum_aware_reward_fn/std": 0.3718010187149048, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1701.0, + "completions/max_terminated_length": 1701.0, + "completions/mean_length": 487.3482360839844, + "completions/mean_terminated_length": 487.3482360839844, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 1.4415785401083312, + "grad_norm": 0.6047545671463013, + "kl": 0.085205078125, + "learning_rate": 1e-06, + "loss": 0.0158, + "num_tokens": 162868454.0, + "reward": 1.2325893640518188, + "reward_std": 0.15268686413764954, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.24151787161827087, + "rewards/curriculum_aware_reward_fn/std": 0.33280736207962036, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 429.1160888671875, + "completions/mean_terminated_length": 429.1160888671875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 1.442610265669332, + "grad_norm": 0.5883904099464417, + "kl": 0.0916748046875, + "learning_rate": 1e-06, + "loss": 0.0152, + "num_tokens": 162982602.0, + "reward": 1.4254467487335205, + "reward_std": 0.14535757899284363, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42544645071029663, + "rewards/curriculum_aware_reward_fn/std": 0.381674587726593, + "step": 1398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1189.0, + "completions/max_terminated_length": 1189.0, + "completions/mean_length": 412.71429443359375, + "completions/mean_terminated_length": 412.71429443359375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 1.4436419912303327, + "grad_norm": 0.6786606907844543, + "kl": 0.098876953125, + "learning_rate": 1e-06, + "loss": 0.0324, + "num_tokens": 163090993.0, + "reward": 1.4593751430511475, + "reward_std": 0.16988961398601532, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4683035910129547, + "rewards/curriculum_aware_reward_fn/std": 0.37626636028289795, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 406.5446472167969, + "completions/mean_terminated_length": 406.5446472167969, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 1.4446737167913335, + "grad_norm": 0.6487584710121155, + "kl": 0.0985107421875, + "learning_rate": 1e-06, + "loss": -0.0094, + "num_tokens": 163202774.0, + "reward": 1.5339287519454956, + "reward_std": 0.1595512330532074, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5339285731315613, + "rewards/curriculum_aware_reward_fn/std": 0.36305850744247437, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1228.0, + "completions/max_terminated_length": 1228.0, + "completions/mean_length": 429.4196472167969, + "completions/mean_terminated_length": 429.4196472167969, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 1.4457054423523343, + "grad_norm": 0.6628140807151794, + "kl": 0.0904541015625, + "learning_rate": 1e-06, + "loss": 0.0303, + "num_tokens": 163320715.0, + "reward": 1.5446429252624512, + "reward_std": 0.20707891881465912, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5535714626312256, + "rewards/curriculum_aware_reward_fn/std": 0.3792307674884796, + "step": 1401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1015.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 475.26788330078125, + "completions/mean_terminated_length": 475.26788330078125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 1.446737167913335, + "grad_norm": 0.7178117036819458, + "kl": 0.0933837890625, + "learning_rate": 1e-06, + "loss": 0.0262, + "num_tokens": 163448845.0, + "reward": 1.4919644594192505, + "reward_std": 0.18081454932689667, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5008928775787354, + "rewards/curriculum_aware_reward_fn/std": 0.47267603874206543, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1358.0, + "completions/max_terminated_length": 1358.0, + "completions/mean_length": 540.919677734375, + "completions/mean_terminated_length": 540.919677734375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 1.4477688934743358, + "grad_norm": 0.6339001655578613, + "kl": 0.080322265625, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 163579764.0, + "reward": 1.2714287042617798, + "reward_std": 0.18801215291023254, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.27142858505249023, + "rewards/curriculum_aware_reward_fn/std": 0.32959944009780884, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 511.232177734375, + "completions/mean_terminated_length": 511.232177734375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 1.4488006190353366, + "grad_norm": 0.7088522911071777, + "kl": 0.1148681640625, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 163706686.0, + "reward": 1.3120536804199219, + "reward_std": 0.15960483253002167, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3209821879863739, + "rewards/curriculum_aware_reward_fn/std": 0.3356879651546478, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 960.0, + "completions/max_terminated_length": 960.0, + "completions/mean_length": 489.6785888671875, + "completions/mean_terminated_length": 489.6785888671875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 1.4498323445963375, + "grad_norm": 0.7049396634101868, + "kl": 0.08203125, + "learning_rate": 1e-06, + "loss": 0.0121, + "num_tokens": 163831845.0, + "reward": 1.4357143640518188, + "reward_std": 0.17000097036361694, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4357143044471741, + "rewards/curriculum_aware_reward_fn/std": 0.3761126697063446, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1513.0, + "completions/max_terminated_length": 1513.0, + "completions/mean_length": 482.6785888671875, + "completions/mean_terminated_length": 482.6785888671875, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 1.450864070157338, + "grad_norm": 0.6695181131362915, + "kl": 0.0791015625, + "learning_rate": 1e-06, + "loss": 0.0071, + "num_tokens": 163948163.0, + "reward": 1.3866074085235596, + "reward_std": 0.19195009768009186, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38660717010498047, + "rewards/curriculum_aware_reward_fn/std": 0.35080230236053467, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 959.0, + "completions/max_terminated_length": 959.0, + "completions/mean_length": 460.14288330078125, + "completions/mean_terminated_length": 460.14288330078125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 1.451895795718339, + "grad_norm": 0.5940765738487244, + "kl": 0.098876953125, + "learning_rate": 1e-06, + "loss": -0.0042, + "num_tokens": 164069804.0, + "reward": 1.3875001668930054, + "reward_std": 0.15338915586471558, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38750001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.41359224915504456, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1256.0, + "completions/max_terminated_length": 1256.0, + "completions/mean_length": 469.4285888671875, + "completions/mean_terminated_length": 469.4285888671875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 1.4529275212793398, + "grad_norm": 0.681818425655365, + "kl": 0.0908203125, + "learning_rate": 1e-06, + "loss": 0.0186, + "num_tokens": 164202390.0, + "reward": 1.4303573369979858, + "reward_std": 0.13860322535037994, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4303571581840515, + "rewards/curriculum_aware_reward_fn/std": 0.3885962963104248, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1762.0, + "completions/max_terminated_length": 1762.0, + "completions/mean_length": 476.3125305175781, + "completions/mean_terminated_length": 476.3125305175781, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 1.4539592468403404, + "grad_norm": 0.6444785594940186, + "kl": 0.089599609375, + "learning_rate": 1e-06, + "loss": 0.0081, + "num_tokens": 164331365.0, + "reward": 1.282589316368103, + "reward_std": 0.15449091792106628, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28258928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.36091160774230957, + "step": 1409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1121.0, + "completions/max_terminated_length": 1121.0, + "completions/mean_length": 510.232177734375, + "completions/mean_terminated_length": 510.232177734375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 1.4549909724013412, + "grad_norm": 0.791489839553833, + "kl": 0.0924072265625, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 164450006.0, + "reward": 1.372321605682373, + "reward_std": 0.25465095043182373, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3812499940395355, + "rewards/curriculum_aware_reward_fn/std": 0.3428894877433777, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1113.0, + "completions/max_terminated_length": 1113.0, + "completions/mean_length": 483.7500305175781, + "completions/mean_terminated_length": 483.7500305175781, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 1.456022697962342, + "grad_norm": 0.6734673380851746, + "kl": 0.091064453125, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 164572702.0, + "reward": 1.3053573369979858, + "reward_std": 0.18237224221229553, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3142857253551483, + "rewards/curriculum_aware_reward_fn/std": 0.37195783853530884, + "step": 1411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1091.0, + "completions/max_terminated_length": 1091.0, + "completions/mean_length": 478.58038330078125, + "completions/mean_terminated_length": 478.58038330078125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 1.457054423523343, + "grad_norm": 0.6534205079078674, + "kl": 0.0865478515625, + "learning_rate": 1e-06, + "loss": 0.0181, + "num_tokens": 164696282.0, + "reward": 1.3339285850524902, + "reward_std": 0.18611732125282288, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33392858505249023, + "rewards/curriculum_aware_reward_fn/std": 0.3818644881248474, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1331.0, + "completions/max_terminated_length": 1331.0, + "completions/mean_length": 470.33929443359375, + "completions/mean_terminated_length": 470.33929443359375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 1.4580861490843435, + "grad_norm": 0.5302649736404419, + "kl": 0.097412109375, + "learning_rate": 1e-06, + "loss": 0.0051, + "num_tokens": 164828620.0, + "reward": 1.3053573369979858, + "reward_std": 0.11551646143198013, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3053571581840515, + "rewards/curriculum_aware_reward_fn/std": 0.353448748588562, + "step": 1413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1141.0, + "completions/max_terminated_length": 1141.0, + "completions/mean_length": 500.6964416503906, + "completions/mean_terminated_length": 500.6964416503906, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 1.4591178746453444, + "grad_norm": 0.7022354006767273, + "kl": 0.0821533203125, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 164960242.0, + "reward": 1.3616071939468384, + "reward_std": 0.2165532261133194, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3705357015132904, + "rewards/curriculum_aware_reward_fn/std": 0.30473393201828003, + "step": 1414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 958.0, + "completions/max_terminated_length": 958.0, + "completions/mean_length": 404.1250305175781, + "completions/mean_terminated_length": 404.1250305175781, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 1.460149600206345, + "grad_norm": 0.6341533660888672, + "kl": 0.0887451171875, + "learning_rate": 1e-06, + "loss": 0.0222, + "num_tokens": 165070009.0, + "reward": 1.4808037281036377, + "reward_std": 0.12062705308198929, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48080354928970337, + "rewards/curriculum_aware_reward_fn/std": 0.3547137975692749, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1158.0, + "completions/max_terminated_length": 1158.0, + "completions/mean_length": 447.89288330078125, + "completions/mean_terminated_length": 447.89288330078125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 1.4611813257673458, + "grad_norm": 0.7164600491523743, + "kl": 0.0855712890625, + "learning_rate": 1e-06, + "loss": 0.0359, + "num_tokens": 165180520.0, + "reward": 1.4535716772079468, + "reward_std": 0.21277686953544617, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45357146859169006, + "rewards/curriculum_aware_reward_fn/std": 0.3822476267814636, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 924.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 476.9285888671875, + "completions/mean_terminated_length": 476.9285888671875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 1.4622130513283467, + "grad_norm": 0.6117580533027649, + "kl": 0.088623046875, + "learning_rate": 1e-06, + "loss": 0.0184, + "num_tokens": 165299372.0, + "reward": 1.5316965579986572, + "reward_std": 0.20724934339523315, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5316964387893677, + "rewards/curriculum_aware_reward_fn/std": 0.481925368309021, + "step": 1417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1294.0, + "completions/mean_length": 559.8392944335938, + "completions/mean_terminated_length": 527.9819946289062, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 1.4632447768893475, + "grad_norm": 0.6708288192749023, + "kl": 0.07763671875, + "learning_rate": 1e-06, + "loss": 0.0337, + "num_tokens": 165434265.0, + "reward": 1.3084824085235596, + "reward_std": 0.2409285604953766, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.31741073727607727, + "rewards/curriculum_aware_reward_fn/std": 0.3392300605773926, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 420.4464416503906, + "completions/mean_terminated_length": 420.4464416503906, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 1.4642765024503481, + "grad_norm": 0.6302593350410461, + "kl": 0.093505859375, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 165555926.0, + "reward": 1.479017972946167, + "reward_std": 0.13766011595726013, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47901788353919983, + "rewards/curriculum_aware_reward_fn/std": 0.3736018240451813, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1119.0, + "completions/max_terminated_length": 1119.0, + "completions/mean_length": 491.419677734375, + "completions/mean_terminated_length": 491.419677734375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 1.465308228011349, + "grad_norm": 0.7415760159492493, + "kl": 0.10595703125, + "learning_rate": 1e-06, + "loss": 0.0227, + "num_tokens": 165684052.0, + "reward": 1.3625000715255737, + "reward_std": 0.19681335985660553, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36250001192092896, + "rewards/curriculum_aware_reward_fn/std": 0.37395501136779785, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 429.0446472167969, + "completions/mean_terminated_length": 429.0446472167969, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 1.4663399535723498, + "grad_norm": 0.8263067603111267, + "kl": 0.1051025390625, + "learning_rate": 1e-06, + "loss": 0.0181, + "num_tokens": 165805031.0, + "reward": 1.393303632736206, + "reward_std": 0.20443706214427948, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39330360293388367, + "rewards/curriculum_aware_reward_fn/std": 0.3297211229801178, + "step": 1421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1190.0, + "completions/max_terminated_length": 1190.0, + "completions/mean_length": 465.71429443359375, + "completions/mean_terminated_length": 465.71429443359375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 1.4673716791333504, + "grad_norm": 0.7716458439826965, + "kl": 0.09326171875, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 165916145.0, + "reward": 1.3147321939468384, + "reward_std": 0.222005695104599, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3147321343421936, + "rewards/curriculum_aware_reward_fn/std": 0.34021976590156555, + "step": 1422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1123.0, + "completions/max_terminated_length": 1123.0, + "completions/mean_length": 482.8125305175781, + "completions/mean_terminated_length": 482.8125305175781, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 1.4684034046943513, + "grad_norm": 0.7555353045463562, + "kl": 0.120849609375, + "learning_rate": 1e-06, + "loss": 0.043, + "num_tokens": 166035684.0, + "reward": 1.2803572416305542, + "reward_std": 0.16637490689754486, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28035715222358704, + "rewards/curriculum_aware_reward_fn/std": 0.34979772567749023, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1108.0, + "completions/max_terminated_length": 1108.0, + "completions/mean_length": 457.77679443359375, + "completions/mean_terminated_length": 457.77679443359375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 1.469435130255352, + "grad_norm": 0.6749049425125122, + "kl": 0.080810546875, + "learning_rate": 1e-06, + "loss": 0.0325, + "num_tokens": 166155016.0, + "reward": 1.395982265472412, + "reward_std": 0.1725309193134308, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39598217606544495, + "rewards/curriculum_aware_reward_fn/std": 0.36447781324386597, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 401.27679443359375, + "completions/mean_terminated_length": 401.27679443359375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 1.470466855816353, + "grad_norm": 0.7543728351593018, + "kl": 0.0909423828125, + "learning_rate": 1e-06, + "loss": -0.0089, + "num_tokens": 166257408.0, + "reward": 1.5339287519454956, + "reward_std": 0.2014743983745575, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5428571105003357, + "rewards/curriculum_aware_reward_fn/std": 0.36630964279174805, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 978.0, + "completions/max_terminated_length": 978.0, + "completions/mean_length": 441.357177734375, + "completions/mean_terminated_length": 441.357177734375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 1.4714985813773536, + "grad_norm": 0.7767645120620728, + "kl": 0.1064453125, + "learning_rate": 1e-06, + "loss": -0.0038, + "num_tokens": 166378646.0, + "reward": 1.4120537042617798, + "reward_std": 0.21909524500370026, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41205358505249023, + "rewards/curriculum_aware_reward_fn/std": 0.3727935552597046, + "step": 1426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 426.33929443359375, + "completions/mean_terminated_length": 426.33929443359375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 1.4725303069383544, + "grad_norm": 0.5623325109481812, + "kl": 0.0906982421875, + "learning_rate": 1e-06, + "loss": 0.023, + "num_tokens": 166488206.0, + "reward": 1.4102680683135986, + "reward_std": 0.12621724605560303, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4102678596973419, + "rewards/curriculum_aware_reward_fn/std": 0.3717585802078247, + "step": 1427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1199.0, + "completions/mean_length": 536.044677734375, + "completions/mean_terminated_length": 503.9729919433594, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 1.4735620324993552, + "grad_norm": 0.5052303671836853, + "kl": 0.076171875, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 166611845.0, + "reward": 1.4236608743667603, + "reward_std": 0.14933934807777405, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4325893223285675, + "rewards/curriculum_aware_reward_fn/std": 0.37994247674942017, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 853.0, + "completions/max_terminated_length": 853.0, + "completions/mean_length": 410.89288330078125, + "completions/mean_terminated_length": 410.89288330078125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 1.4745937580603559, + "grad_norm": 0.6693888306617737, + "kl": 0.095703125, + "learning_rate": 1e-06, + "loss": 0.0297, + "num_tokens": 166724099.0, + "reward": 1.462053656578064, + "reward_std": 0.18026289343833923, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4620535671710968, + "rewards/curriculum_aware_reward_fn/std": 0.36590293049812317, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 465.9107360839844, + "completions/mean_terminated_length": 465.9107360839844, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 1.4756254836213567, + "grad_norm": 0.598745584487915, + "kl": 0.0985107421875, + "learning_rate": 1e-06, + "loss": -0.0226, + "num_tokens": 166836033.0, + "reward": 1.3633930683135986, + "reward_std": 0.129484161734581, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3633928894996643, + "rewards/curriculum_aware_reward_fn/std": 0.3225030303001404, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1022.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 403.76788330078125, + "completions/mean_terminated_length": 403.76788330078125, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 1.4766572091823575, + "grad_norm": 0.864762544631958, + "kl": 0.10986328125, + "learning_rate": 1e-06, + "loss": 0.0353, + "num_tokens": 166952648.0, + "reward": 1.456696629524231, + "reward_std": 0.20940305292606354, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45669645071029663, + "rewards/curriculum_aware_reward_fn/std": 0.37026259303092957, + "step": 1431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1019.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 422.0714416503906, + "completions/mean_terminated_length": 422.0714416503906, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 1.4776889347433584, + "grad_norm": 0.7573868036270142, + "kl": 0.092529296875, + "learning_rate": 1e-06, + "loss": 0.0251, + "num_tokens": 167074190.0, + "reward": 1.4294644594192505, + "reward_std": 0.20020684599876404, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.43839287757873535, + "rewards/curriculum_aware_reward_fn/std": 0.37476611137390137, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 944.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 469.2500305175781, + "completions/mean_terminated_length": 469.2500305175781, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 1.478720660304359, + "grad_norm": 0.7239991426467896, + "kl": 0.0771484375, + "learning_rate": 1e-06, + "loss": -0.0155, + "num_tokens": 167188346.0, + "reward": 1.3848215341567993, + "reward_std": 0.1801411509513855, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.39374998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3647240698337555, + "step": 1433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 947.0, + "completions/max_terminated_length": 947.0, + "completions/mean_length": 429.77679443359375, + "completions/mean_terminated_length": 429.77679443359375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 1.4797523858653598, + "grad_norm": 0.7500477433204651, + "kl": 0.0946044921875, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 167298809.0, + "reward": 1.3700894117355347, + "reward_std": 0.16702376306056976, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3700892925262451, + "rewards/curriculum_aware_reward_fn/std": 0.356184720993042, + "step": 1434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 934.0, + "completions/max_terminated_length": 934.0, + "completions/mean_length": 457.64288330078125, + "completions/mean_terminated_length": 457.64288330078125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 1.4807841114263605, + "grad_norm": 0.6691403388977051, + "kl": 0.0888671875, + "learning_rate": 1e-06, + "loss": -0.0075, + "num_tokens": 167418070.0, + "reward": 1.2071428298950195, + "reward_std": 0.1764199137687683, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.20714284479618073, + "rewards/curriculum_aware_reward_fn/std": 0.30349743366241455, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 847.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 471.919677734375, + "completions/mean_terminated_length": 471.919677734375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 1.4818158369873613, + "grad_norm": 0.7403013110160828, + "kl": 0.08349609375, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 167546714.0, + "reward": 1.3919644355773926, + "reward_std": 0.19533541798591614, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39196428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3597768247127533, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 955.0, + "completions/max_terminated_length": 955.0, + "completions/mean_length": 429.7410888671875, + "completions/mean_terminated_length": 429.7410888671875, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 1.4828475625483621, + "grad_norm": 0.6121811866760254, + "kl": 0.088623046875, + "learning_rate": 1e-06, + "loss": 0.0255, + "num_tokens": 167643670.0, + "reward": 1.395982265472412, + "reward_std": 0.14923794567584991, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39598211646080017, + "rewards/curriculum_aware_reward_fn/std": 0.3318755328655243, + "step": 1437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 457.2410888671875, + "completions/mean_terminated_length": 457.2410888671875, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 1.483879288109363, + "grad_norm": 0.6797091960906982, + "kl": 0.0784912109375, + "learning_rate": 1e-06, + "loss": 0.0265, + "num_tokens": 167762984.0, + "reward": 1.3156250715255737, + "reward_std": 0.18089859187602997, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3589920401573181, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 373.7589416503906, + "completions/mean_terminated_length": 373.7589416503906, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 1.4849110136703638, + "grad_norm": 0.728999674320221, + "kl": 0.1011962890625, + "learning_rate": 1e-06, + "loss": -0.0126, + "num_tokens": 167867271.0, + "reward": 1.4187501668930054, + "reward_std": 0.14087806642055511, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41875001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.37830376625061035, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1249.0, + "completions/max_terminated_length": 1249.0, + "completions/mean_length": 489.46429443359375, + "completions/mean_terminated_length": 489.46429443359375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 1.4859427392313644, + "grad_norm": 0.6123307347297668, + "kl": 0.081787109375, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 167989445.0, + "reward": 1.3000000715255737, + "reward_std": 0.1632949411869049, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29999998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3484198749065399, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1670.0, + "completions/max_terminated_length": 1670.0, + "completions/mean_length": 426.02679443359375, + "completions/mean_terminated_length": 426.02679443359375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 1.4869744647923653, + "grad_norm": 0.7175002098083496, + "kl": 0.084228515625, + "learning_rate": 1e-06, + "loss": -0.0049, + "num_tokens": 168097430.0, + "reward": 1.4450894594192505, + "reward_std": 0.20554915070533752, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44508928060531616, + "rewards/curriculum_aware_reward_fn/std": 0.3961503803730011, + "step": 1441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 412.8214416503906, + "completions/mean_terminated_length": 412.8214416503906, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 1.488006190353366, + "grad_norm": 0.7144314646720886, + "kl": 0.091064453125, + "learning_rate": 1e-06, + "loss": -0.0084, + "num_tokens": 168203138.0, + "reward": 1.520535945892334, + "reward_std": 0.18925592303276062, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5205357074737549, + "rewards/curriculum_aware_reward_fn/std": 0.3949885368347168, + "step": 1442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 955.0, + "completions/max_terminated_length": 955.0, + "completions/mean_length": 444.64288330078125, + "completions/mean_terminated_length": 444.64288330078125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 1.4890379159143667, + "grad_norm": 0.5935328602790833, + "kl": 0.0838623046875, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 168323057.0, + "reward": 1.3866074085235596, + "reward_std": 0.0912431851029396, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3866071403026581, + "rewards/curriculum_aware_reward_fn/std": 0.37554657459259033, + "step": 1443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 892.0, + "completions/max_terminated_length": 892.0, + "completions/mean_length": 399.7500305175781, + "completions/mean_terminated_length": 399.7500305175781, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 1.4900696414753676, + "grad_norm": 0.5947428345680237, + "kl": 0.0933837890625, + "learning_rate": 1e-06, + "loss": -0.0017, + "num_tokens": 168429051.0, + "reward": 1.4129464626312256, + "reward_std": 0.14674913883209229, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.421875, + "rewards/curriculum_aware_reward_fn/std": 0.3914503753185272, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1437.0, + "completions/max_terminated_length": 1437.0, + "completions/mean_length": 467.64288330078125, + "completions/mean_terminated_length": 467.64288330078125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 1.4911013670363684, + "grad_norm": 0.7172960638999939, + "kl": 0.0882568359375, + "learning_rate": 1e-06, + "loss": 0.0169, + "num_tokens": 168550909.0, + "reward": 1.3223215341567993, + "reward_std": 0.19947509467601776, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32232141494750977, + "rewards/curriculum_aware_reward_fn/std": 0.345814049243927, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 465.83038330078125, + "completions/mean_terminated_length": 465.83038330078125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 1.492133092597369, + "grad_norm": 0.7651225924491882, + "kl": 0.0965576171875, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 168679326.0, + "reward": 1.2294644117355347, + "reward_std": 0.15720400214195251, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22946429252624512, + "rewards/curriculum_aware_reward_fn/std": 0.29906412959098816, + "step": 1446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1656.0, + "completions/max_terminated_length": 1656.0, + "completions/mean_length": 458.1875305175781, + "completions/mean_terminated_length": 458.1875305175781, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 1.4931648181583699, + "grad_norm": 0.6342578530311584, + "kl": 0.096435546875, + "learning_rate": 1e-06, + "loss": 0.0139, + "num_tokens": 168795218.0, + "reward": 1.4379466772079468, + "reward_std": 0.13654667139053345, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43794646859169006, + "rewards/curriculum_aware_reward_fn/std": 0.3718256652355194, + "step": 1447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1089.0, + "completions/max_terminated_length": 1089.0, + "completions/mean_length": 483.2500305175781, + "completions/mean_terminated_length": 483.2500305175781, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 1.4941965437193707, + "grad_norm": 0.6862965226173401, + "kl": 0.082275390625, + "learning_rate": 1e-06, + "loss": -0.0112, + "num_tokens": 168917333.0, + "reward": 1.3508929014205933, + "reward_std": 0.18875552713871002, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3508928716182709, + "rewards/curriculum_aware_reward_fn/std": 0.35754305124282837, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 440.3660888671875, + "completions/mean_terminated_length": 440.3660888671875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 1.4952282692803713, + "grad_norm": 0.6744213104248047, + "kl": 0.0999755859375, + "learning_rate": 1e-06, + "loss": 0.0155, + "num_tokens": 169033182.0, + "reward": 1.3272322416305542, + "reward_std": 0.126448854804039, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32723215222358704, + "rewards/curriculum_aware_reward_fn/std": 0.36235520243644714, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1382.0, + "completions/max_terminated_length": 1382.0, + "completions/mean_length": 476.20538330078125, + "completions/mean_terminated_length": 476.20538330078125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 1.4962599948413722, + "grad_norm": 0.7253400087356567, + "kl": 0.0897216796875, + "learning_rate": 1e-06, + "loss": 0.026, + "num_tokens": 169151848.0, + "reward": 1.3625000715255737, + "reward_std": 0.18081489205360413, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.37647607922554016, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 879.0, + "completions/max_terminated_length": 879.0, + "completions/mean_length": 468.95538330078125, + "completions/mean_terminated_length": 468.95538330078125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 1.497291720402373, + "grad_norm": 0.6010491251945496, + "kl": 0.0977783203125, + "learning_rate": 1e-06, + "loss": -0.0119, + "num_tokens": 169275946.0, + "reward": 1.4964287281036377, + "reward_std": 0.15095055103302002, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5053572058677673, + "rewards/curriculum_aware_reward_fn/std": 0.3826388418674469, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 404.5535888671875, + "completions/mean_terminated_length": 404.5535888671875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 1.4983234459633739, + "grad_norm": 0.6166358590126038, + "kl": 0.0943603515625, + "learning_rate": 1e-06, + "loss": -0.0163, + "num_tokens": 169382233.0, + "reward": 1.4875000715255737, + "reward_std": 0.15182271599769592, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48750001192092896, + "rewards/curriculum_aware_reward_fn/std": 0.37099212408065796, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 406.71429443359375, + "completions/mean_terminated_length": 406.71429443359375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 1.4993551715243745, + "grad_norm": 0.7371792197227478, + "kl": 0.100341796875, + "learning_rate": 1e-06, + "loss": -0.0098, + "num_tokens": 169485269.0, + "reward": 1.5120537281036377, + "reward_std": 0.23051097989082336, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5209821462631226, + "rewards/curriculum_aware_reward_fn/std": 0.36109432578086853, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1151.0, + "completions/max_terminated_length": 1151.0, + "completions/mean_length": 465.02679443359375, + "completions/mean_terminated_length": 465.02679443359375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 1.5003868970853753, + "grad_norm": 0.6901764869689941, + "kl": 0.0924072265625, + "learning_rate": 1e-06, + "loss": 0.0338, + "num_tokens": 169602452.0, + "reward": 1.266964316368103, + "reward_std": 0.1574702113866806, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.26696428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.34113970398902893, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 853.0, + "completions/max_terminated_length": 853.0, + "completions/mean_length": 432.0625305175781, + "completions/mean_terminated_length": 432.0625305175781, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 1.501418622646376, + "grad_norm": 0.809800386428833, + "kl": 0.0904541015625, + "learning_rate": 1e-06, + "loss": -0.0047, + "num_tokens": 169714830.0, + "reward": 1.3727679252624512, + "reward_std": 0.17047983407974243, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3727678656578064, + "rewards/curriculum_aware_reward_fn/std": 0.35481810569763184, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 410.2232360839844, + "completions/mean_terminated_length": 410.2232360839844, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 1.5024503482073768, + "grad_norm": 0.8028994798660278, + "kl": 0.109375, + "learning_rate": 1e-06, + "loss": 0.0107, + "num_tokens": 169837551.0, + "reward": 1.5437501668930054, + "reward_std": 0.18651224672794342, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.543749988079071, + "rewards/curriculum_aware_reward_fn/std": 0.38038182258605957, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 933.0, + "completions/max_terminated_length": 933.0, + "completions/mean_length": 458.8125305175781, + "completions/mean_terminated_length": 458.8125305175781, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 1.5034820737683776, + "grad_norm": 0.7478047013282776, + "kl": 0.0880126953125, + "learning_rate": 1e-06, + "loss": 0.0133, + "num_tokens": 169960275.0, + "reward": 1.4183037281036377, + "reward_std": 0.1733304113149643, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41830354928970337, + "rewards/curriculum_aware_reward_fn/std": 0.3492588996887207, + "step": 1457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1865.0, + "completions/max_terminated_length": 1865.0, + "completions/mean_length": 500.0357360839844, + "completions/mean_terminated_length": 500.0357360839844, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 1.5045137993293785, + "grad_norm": 0.6545856595039368, + "kl": 0.0909423828125, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 170078768.0, + "reward": 1.4303573369979858, + "reward_std": 0.14356939494609833, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4303571879863739, + "rewards/curriculum_aware_reward_fn/std": 0.3660328686237335, + "step": 1458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 443.9107360839844, + "completions/mean_terminated_length": 443.9107360839844, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 1.5055455248903793, + "grad_norm": 0.6835969686508179, + "kl": 0.10302734375, + "learning_rate": 1e-06, + "loss": 0.0181, + "num_tokens": 170194993.0, + "reward": 1.3500001430511475, + "reward_std": 0.2047101855278015, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3499999940395355, + "rewards/curriculum_aware_reward_fn/std": 0.3665028214454651, + "step": 1459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 793.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 481.107177734375, + "completions/mean_terminated_length": 481.107177734375, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 1.50657725045138, + "grad_norm": 0.632583737373352, + "kl": 0.0914306640625, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 170317680.0, + "reward": 1.2718751430511475, + "reward_std": 0.1807551085948944, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2718749940395355, + "rewards/curriculum_aware_reward_fn/std": 0.29391586780548096, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 443.2500305175781, + "completions/mean_terminated_length": 443.2500305175781, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 1.5076089760123808, + "grad_norm": 0.7975127100944519, + "kl": 0.0994873046875, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 170440950.0, + "reward": 1.3781250715255737, + "reward_std": 0.15491509437561035, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37812501192092896, + "rewards/curriculum_aware_reward_fn/std": 0.351559042930603, + "step": 1461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 955.0, + "completions/max_terminated_length": 955.0, + "completions/mean_length": 450.3660888671875, + "completions/mean_terminated_length": 450.3660888671875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 1.5086407015733814, + "grad_norm": 0.7866511344909668, + "kl": 0.0982666015625, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 170561251.0, + "reward": 1.4392858743667603, + "reward_std": 0.1659286618232727, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4392857253551483, + "rewards/curriculum_aware_reward_fn/std": 0.3631958067417145, + "step": 1462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 872.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 440.3214416503906, + "completions/mean_terminated_length": 440.3214416503906, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 1.5096724271343822, + "grad_norm": 0.7658423781394958, + "kl": 0.11376953125, + "learning_rate": 1e-06, + "loss": 0.0465, + "num_tokens": 170683612.0, + "reward": 1.4495537281036377, + "reward_std": 0.23375743627548218, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44955357909202576, + "rewards/curriculum_aware_reward_fn/std": 0.3733519911766052, + "step": 1463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 876.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 451.2500305175781, + "completions/mean_terminated_length": 451.2500305175781, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 1.510704152695383, + "grad_norm": 0.7433845400810242, + "kl": 0.095947265625, + "learning_rate": 1e-06, + "loss": 0.0258, + "num_tokens": 170803396.0, + "reward": 1.3691965341567993, + "reward_std": 0.15630899369716644, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36919641494750977, + "rewards/curriculum_aware_reward_fn/std": 0.34888792037963867, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1013.0, + "completions/max_terminated_length": 1013.0, + "completions/mean_length": 420.0446472167969, + "completions/mean_terminated_length": 420.0446472167969, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 1.511735878256384, + "grad_norm": 0.6753164529800415, + "kl": 0.0966796875, + "learning_rate": 1e-06, + "loss": -0.0128, + "num_tokens": 170915723.0, + "reward": 1.4419645071029663, + "reward_std": 0.14553149044513702, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.441964328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3703877925872803, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 988.0, + "completions/max_terminated_length": 988.0, + "completions/mean_length": 465.0982360839844, + "completions/mean_terminated_length": 465.0982360839844, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 1.5127676038173847, + "grad_norm": 0.7211045622825623, + "kl": 0.0921630859375, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 171031822.0, + "reward": 1.3616071939468384, + "reward_std": 0.1773756593465805, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3616071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.3393145203590393, + "step": 1466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 976.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 433.9375305175781, + "completions/mean_terminated_length": 433.9375305175781, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 1.5137993293783853, + "grad_norm": 0.8377368450164795, + "kl": 0.0975341796875, + "learning_rate": 1e-06, + "loss": -0.0254, + "num_tokens": 171144655.0, + "reward": 1.360267996788025, + "reward_std": 0.2237682342529297, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36026787757873535, + "rewards/curriculum_aware_reward_fn/std": 0.3718797266483307, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2812.0, + "completions/max_terminated_length": 2812.0, + "completions/mean_length": 445.70538330078125, + "completions/mean_terminated_length": 445.70538330078125, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 1.514831054939386, + "grad_norm": 0.7094552516937256, + "kl": 0.0989990234375, + "learning_rate": 1e-06, + "loss": 0.0374, + "num_tokens": 171264070.0, + "reward": 1.3732144832611084, + "reward_std": 0.16332145035266876, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3732143044471741, + "rewards/curriculum_aware_reward_fn/std": 0.3847269117832184, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 418.0625305175781, + "completions/mean_terminated_length": 418.0625305175781, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 1.5158627805003868, + "grad_norm": 0.8003520369529724, + "kl": 0.098876953125, + "learning_rate": 1e-06, + "loss": 0.0181, + "num_tokens": 171364912.0, + "reward": 1.3218752145767212, + "reward_std": 0.1855483502149582, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3218750059604645, + "rewards/curriculum_aware_reward_fn/std": 0.37373098731040955, + "step": 1469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 387.4375305175781, + "completions/mean_terminated_length": 387.4375305175781, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 1.5168945060613876, + "grad_norm": 0.7462090849876404, + "kl": 0.1134033203125, + "learning_rate": 1e-06, + "loss": 0.0188, + "num_tokens": 171464501.0, + "reward": 1.5415178537368774, + "reward_std": 0.19770653545856476, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5415178537368774, + "rewards/curriculum_aware_reward_fn/std": 0.38107770681381226, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 464.1875305175781, + "completions/mean_terminated_length": 464.1875305175781, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 1.5179262316223885, + "grad_norm": 0.6659581065177917, + "kl": 0.088623046875, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 171584250.0, + "reward": 1.3665181398391724, + "reward_std": 0.13865076005458832, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3665178716182709, + "rewards/curriculum_aware_reward_fn/std": 0.3345959186553955, + "step": 1471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 454.71429443359375, + "completions/mean_terminated_length": 454.71429443359375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 1.5189579571833893, + "grad_norm": 0.66074538230896, + "kl": 0.0784912109375, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 171702518.0, + "reward": 1.4281251430511475, + "reward_std": 0.09557029604911804, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4281249940395355, + "rewards/curriculum_aware_reward_fn/std": 0.3529018461704254, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3126.0, + "completions/max_terminated_length": 3126.0, + "completions/mean_length": 476.2500305175781, + "completions/mean_terminated_length": 476.2500305175781, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 1.51998968274439, + "grad_norm": 0.7866209149360657, + "kl": 0.0963134765625, + "learning_rate": 1e-06, + "loss": 0.0541, + "num_tokens": 171820139.0, + "reward": 1.2075893878936768, + "reward_std": 0.26282820105552673, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.2254464328289032, + "rewards/curriculum_aware_reward_fn/std": 0.33858445286750793, + "step": 1473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1056.0, + "completions/max_terminated_length": 1056.0, + "completions/mean_length": 457.89288330078125, + "completions/mean_terminated_length": 457.89288330078125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 1.5210214083053908, + "grad_norm": 0.7619090676307678, + "kl": 0.0849609375, + "learning_rate": 1e-06, + "loss": -0.005, + "num_tokens": 171948801.0, + "reward": 1.4491074085235596, + "reward_std": 0.1694876253604889, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44910717010498047, + "rewards/curriculum_aware_reward_fn/std": 0.3511873185634613, + "step": 1474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 965.0, + "completions/max_terminated_length": 965.0, + "completions/mean_length": 457.0000305175781, + "completions/mean_terminated_length": 457.0000305175781, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 1.5220531338663914, + "grad_norm": 0.626873791217804, + "kl": 0.0914306640625, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 172072526.0, + "reward": 1.40223228931427, + "reward_std": 0.1566895991563797, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4111607074737549, + "rewards/curriculum_aware_reward_fn/std": 0.36537712812423706, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 938.0, + "completions/max_terminated_length": 938.0, + "completions/mean_length": 436.4732360839844, + "completions/mean_terminated_length": 436.4732360839844, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 1.5230848594273922, + "grad_norm": 0.6276397109031677, + "kl": 0.08154296875, + "learning_rate": 1e-06, + "loss": 0.0366, + "num_tokens": 172188023.0, + "reward": 1.4066966772079468, + "reward_std": 0.1416996717453003, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4156250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3868928849697113, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 480.26788330078125, + "completions/mean_terminated_length": 480.26788330078125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 1.524116584988393, + "grad_norm": 0.7986273169517517, + "kl": 0.0885009765625, + "learning_rate": 1e-06, + "loss": -0.0068, + "num_tokens": 172305037.0, + "reward": 1.3424108028411865, + "reward_std": 0.1933390200138092, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.35133931040763855, + "rewards/curriculum_aware_reward_fn/std": 0.3585166931152344, + "step": 1477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.0, + "completions/max_terminated_length": 773.0, + "completions/mean_length": 454.8839416503906, + "completions/mean_terminated_length": 454.8839416503906, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 1.525148310549394, + "grad_norm": 0.73015296459198, + "kl": 0.0775146484375, + "learning_rate": 1e-06, + "loss": 0.0172, + "num_tokens": 172431521.0, + "reward": 1.4276787042617798, + "reward_std": 0.18670453131198883, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42767858505249023, + "rewards/curriculum_aware_reward_fn/std": 0.3737344741821289, + "step": 1478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 952.0, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 456.83929443359375, + "completions/mean_terminated_length": 456.83929443359375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 1.5261800361103948, + "grad_norm": 0.6959778070449829, + "kl": 0.0828857421875, + "learning_rate": 1e-06, + "loss": -0.0118, + "num_tokens": 172552393.0, + "reward": 1.33973228931427, + "reward_std": 0.18844622373580933, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33973217010498047, + "rewards/curriculum_aware_reward_fn/std": 0.3715161681175232, + "step": 1479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2458.0, + "completions/max_terminated_length": 2458.0, + "completions/mean_length": 501.2410888671875, + "completions/mean_terminated_length": 501.2410888671875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 1.5272117616713954, + "grad_norm": 0.6077437400817871, + "kl": 0.0804443359375, + "learning_rate": 1e-06, + "loss": 0.0435, + "num_tokens": 172678861.0, + "reward": 1.3825894594192505, + "reward_std": 0.15254563093185425, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38258931040763855, + "rewards/curriculum_aware_reward_fn/std": 0.3776235580444336, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 424.4285888671875, + "completions/mean_terminated_length": 424.4285888671875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 1.5282434872323962, + "grad_norm": 0.7133572697639465, + "kl": 0.0950927734375, + "learning_rate": 1e-06, + "loss": -0.0163, + "num_tokens": 172785540.0, + "reward": 1.5232144594192505, + "reward_std": 0.15948817133903503, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5232142806053162, + "rewards/curriculum_aware_reward_fn/std": 0.3813164234161377, + "step": 1481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 416.4732360839844, + "completions/mean_terminated_length": 416.4732360839844, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 1.5292752127933968, + "grad_norm": 0.8125452399253845, + "kl": 0.09814453125, + "learning_rate": 1e-06, + "loss": -0.0033, + "num_tokens": 172902934.0, + "reward": 1.425446629524231, + "reward_std": 0.16105496883392334, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42544645071029663, + "rewards/curriculum_aware_reward_fn/std": 0.3703538775444031, + "step": 1482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 480.5089416503906, + "completions/mean_terminated_length": 480.5089416503906, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 1.5303069383543977, + "grad_norm": 0.6588963866233826, + "kl": 0.1007080078125, + "learning_rate": 1e-06, + "loss": 0.0381, + "num_tokens": 173030111.0, + "reward": 1.4031251668930054, + "reward_std": 0.16077668964862823, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40312501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.37097883224487305, + "step": 1483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1053.0, + "completions/max_terminated_length": 1053.0, + "completions/mean_length": 432.0982360839844, + "completions/mean_terminated_length": 432.0982360839844, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 1.5313386639153985, + "grad_norm": 0.7676278352737427, + "kl": 0.0838623046875, + "learning_rate": 1e-06, + "loss": 0.0137, + "num_tokens": 173144348.0, + "reward": 1.6035715341567993, + "reward_std": 0.20362050831317902, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6035714149475098, + "rewards/curriculum_aware_reward_fn/std": 0.4420323073863983, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 997.0, + "completions/max_terminated_length": 997.0, + "completions/mean_length": 465.5625305175781, + "completions/mean_terminated_length": 465.5625305175781, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 1.5323703894763994, + "grad_norm": 0.79965740442276, + "kl": 0.08837890625, + "learning_rate": 1e-06, + "loss": 0.0046, + "num_tokens": 173260478.0, + "reward": 1.3312500715255737, + "reward_std": 0.1637294739484787, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33125001192092896, + "rewards/curriculum_aware_reward_fn/std": 0.33498185873031616, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1007.0, + "completions/max_terminated_length": 1007.0, + "completions/mean_length": 434.7500305175781, + "completions/mean_terminated_length": 434.7500305175781, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 1.5334021150374002, + "grad_norm": 0.7042688131332397, + "kl": 0.0911865234375, + "learning_rate": 1e-06, + "loss": -0.0036, + "num_tokens": 173375474.0, + "reward": 1.395535945892334, + "reward_std": 0.14024481177330017, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39553573727607727, + "rewards/curriculum_aware_reward_fn/std": 0.33748897910118103, + "step": 1486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 746.0, + "completions/max_terminated_length": 746.0, + "completions/mean_length": 426.90179443359375, + "completions/mean_terminated_length": 426.90179443359375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 1.5344338405984008, + "grad_norm": 0.9660199284553528, + "kl": 0.1873779296875, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 173484614.0, + "reward": 1.2669644355773926, + "reward_std": 0.12665453553199768, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.26696428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.34800291061401367, + "step": 1487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 408.71429443359375, + "completions/mean_terminated_length": 408.71429443359375, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 1.5354655661594014, + "grad_norm": 0.7146320343017578, + "kl": 0.089111328125, + "learning_rate": 1e-06, + "loss": 0.0136, + "num_tokens": 173587853.0, + "reward": 1.4758931398391724, + "reward_std": 0.14861474931240082, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4758928716182709, + "rewards/curriculum_aware_reward_fn/std": 0.37781867384910583, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 439.3125305175781, + "completions/mean_terminated_length": 439.3125305175781, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 1.5364972917204023, + "grad_norm": 0.7023717164993286, + "kl": 0.0887451171875, + "learning_rate": 1e-06, + "loss": -0.0079, + "num_tokens": 173704438.0, + "reward": 1.296875238418579, + "reward_std": 0.18027296662330627, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.296875, + "rewards/curriculum_aware_reward_fn/std": 0.34681832790374756, + "step": 1489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 851.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 444.6875305175781, + "completions/mean_terminated_length": 444.6875305175781, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 1.5375290172814031, + "grad_norm": 0.6490708589553833, + "kl": 0.1005859375, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 173825776.0, + "reward": 1.3754465579986572, + "reward_std": 0.18801386654376984, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37544646859169006, + "rewards/curriculum_aware_reward_fn/std": 0.38502344489097595, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 411.7321472167969, + "completions/mean_terminated_length": 411.7321472167969, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 1.538560742842404, + "grad_norm": 0.7477344870567322, + "kl": 0.090576171875, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 173934806.0, + "reward": 1.4120535850524902, + "reward_std": 0.15478430688381195, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41205358505249023, + "rewards/curriculum_aware_reward_fn/std": 0.34817707538604736, + "step": 1491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1050.0, + "completions/max_terminated_length": 1050.0, + "completions/mean_length": 446.9910888671875, + "completions/mean_terminated_length": 446.9910888671875, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 1.5395924684034048, + "grad_norm": 0.46143588423728943, + "kl": 0.078125, + "learning_rate": 1e-06, + "loss": 0.0129, + "num_tokens": 174051887.0, + "reward": 1.3267858028411865, + "reward_std": 0.07234279066324234, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32678574323654175, + "rewards/curriculum_aware_reward_fn/std": 0.36955833435058594, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 368.7232360839844, + "completions/mean_terminated_length": 368.7232360839844, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 1.5406241939644054, + "grad_norm": 0.6577993035316467, + "kl": 0.1026611328125, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 174161896.0, + "reward": 1.4656251668930054, + "reward_std": 0.16244220733642578, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46562501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.36809906363487244, + "step": 1493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1028.0, + "completions/max_terminated_length": 1028.0, + "completions/mean_length": 428.4910888671875, + "completions/mean_terminated_length": 428.4910888671875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 1.5416559195254063, + "grad_norm": 0.7847018241882324, + "kl": 0.0850830078125, + "learning_rate": 1e-06, + "loss": 0.0062, + "num_tokens": 174275722.0, + "reward": 1.3973214626312256, + "reward_std": 0.24171169102191925, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.40625, + "rewards/curriculum_aware_reward_fn/std": 0.35375240445137024, + "step": 1494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 417.96429443359375, + "completions/mean_terminated_length": 417.96429443359375, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 1.5426876450864069, + "grad_norm": 0.7562360763549805, + "kl": 0.0975341796875, + "learning_rate": 1e-06, + "loss": 0.0024, + "num_tokens": 174390273.0, + "reward": 1.4120535850524902, + "reward_std": 0.18743880093097687, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41205358505249023, + "rewards/curriculum_aware_reward_fn/std": 0.37687939405441284, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1126.0, + "completions/max_terminated_length": 1126.0, + "completions/mean_length": 473.83929443359375, + "completions/mean_terminated_length": 473.83929443359375, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 1.5437193706474077, + "grad_norm": 0.6407991051673889, + "kl": 0.07470703125, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 174505937.0, + "reward": 1.3714287281036377, + "reward_std": 0.17726092040538788, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37142854928970337, + "rewards/curriculum_aware_reward_fn/std": 0.3861168622970581, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 450.4375305175781, + "completions/mean_terminated_length": 450.4375305175781, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 1.5447510962084086, + "grad_norm": 0.7203906178474426, + "kl": 0.0848388671875, + "learning_rate": 1e-06, + "loss": 0.0186, + "num_tokens": 174631881.0, + "reward": 1.4767858982086182, + "reward_std": 0.2179284691810608, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.48571428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.34986668825149536, + "step": 1497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 829.0, + "completions/max_terminated_length": 829.0, + "completions/mean_length": 480.9464416503906, + "completions/mean_terminated_length": 480.9464416503906, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 1.5457828217694094, + "grad_norm": 0.7707893252372742, + "kl": 0.0919189453125, + "learning_rate": 1e-06, + "loss": -0.0256, + "num_tokens": 174765148.0, + "reward": 1.2928574085235596, + "reward_std": 0.16916993260383606, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29285717010498047, + "rewards/curriculum_aware_reward_fn/std": 0.33556845784187317, + "step": 1498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 484.5982360839844, + "completions/mean_terminated_length": 484.5982360839844, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 1.5468145473304102, + "grad_norm": 0.7403567433357239, + "kl": 0.0938720703125, + "learning_rate": 1e-06, + "loss": -0.017, + "num_tokens": 174886028.0, + "reward": 1.250892996788025, + "reward_std": 0.17120662331581116, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25089287757873535, + "rewards/curriculum_aware_reward_fn/std": 0.32403579354286194, + "step": 1499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 992.0, + "completions/max_terminated_length": 992.0, + "completions/mean_length": 445.1250305175781, + "completions/mean_terminated_length": 445.1250305175781, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 1.5478462728914109, + "grad_norm": 0.7902706265449524, + "kl": 0.0830078125, + "learning_rate": 1e-06, + "loss": 0.0348, + "num_tokens": 175003271.0, + "reward": 1.3799108266830444, + "reward_std": 0.2304457426071167, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37991073727607727, + "rewards/curriculum_aware_reward_fn/std": 0.36151954531669617, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 426.1250305175781, + "completions/mean_terminated_length": 426.1250305175781, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 1.5488779984524117, + "grad_norm": 0.827652096748352, + "kl": 0.0955810546875, + "learning_rate": 1e-06, + "loss": -0.0073, + "num_tokens": 175120878.0, + "reward": 1.3937503099441528, + "reward_std": 0.15829935669898987, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39374998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.36342495679855347, + "step": 1501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1138.0, + "completions/max_terminated_length": 1138.0, + "completions/mean_length": 497.3125305175781, + "completions/mean_terminated_length": 497.3125305175781, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 1.5499097240134123, + "grad_norm": 0.7216241359710693, + "kl": 0.0885009765625, + "learning_rate": 1e-06, + "loss": 0.0092, + "num_tokens": 175248362.0, + "reward": 1.3397324085235596, + "reward_std": 0.17447860538959503, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33973217010498047, + "rewards/curriculum_aware_reward_fn/std": 0.34557294845581055, + "step": 1502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 429.4732360839844, + "completions/mean_terminated_length": 429.4732360839844, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 1.5509414495744132, + "grad_norm": 0.7612575888633728, + "kl": 0.0924072265625, + "learning_rate": 1e-06, + "loss": 0.0357, + "num_tokens": 175362711.0, + "reward": 1.3892858028411865, + "reward_std": 0.21689125895500183, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38928574323654175, + "rewards/curriculum_aware_reward_fn/std": 0.37539342045783997, + "step": 1503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.0, + "completions/max_terminated_length": 773.0, + "completions/mean_length": 450.3125305175781, + "completions/mean_terminated_length": 450.3125305175781, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 1.551973175135414, + "grad_norm": 0.6470363736152649, + "kl": 0.0928955078125, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 175483062.0, + "reward": 1.325446605682373, + "reward_std": 0.18153639137744904, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3343749940395355, + "rewards/curriculum_aware_reward_fn/std": 0.36724144220352173, + "step": 1504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 409.8482360839844, + "completions/mean_terminated_length": 409.8482360839844, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 1.5530049006964148, + "grad_norm": 0.7157784700393677, + "kl": 0.0926513671875, + "learning_rate": 1e-06, + "loss": -0.0058, + "num_tokens": 175589690.0, + "reward": 1.464285969734192, + "reward_std": 0.18744036555290222, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4642857015132904, + "rewards/curriculum_aware_reward_fn/std": 0.37712928652763367, + "step": 1505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 429.01788330078125, + "completions/mean_terminated_length": 429.01788330078125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 1.5540366262574157, + "grad_norm": 0.6890504360198975, + "kl": 0.0814208984375, + "learning_rate": 1e-06, + "loss": -0.0115, + "num_tokens": 175689708.0, + "reward": 1.3250000476837158, + "reward_std": 0.15183790028095245, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32500001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3779389262199402, + "step": 1506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 987.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 452.9910888671875, + "completions/mean_terminated_length": 452.9910888671875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 1.5550683518184163, + "grad_norm": 0.7142993807792664, + "kl": 0.08837890625, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 175803296.0, + "reward": 1.3285716772079468, + "reward_std": 0.21572059392929077, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3486460745334625, + "step": 1507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1064.0, + "completions/max_terminated_length": 1064.0, + "completions/mean_length": 449.6160888671875, + "completions/mean_terminated_length": 449.6160888671875, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 1.556100077379417, + "grad_norm": 0.743757426738739, + "kl": 0.0986328125, + "learning_rate": 1e-06, + "loss": -0.0195, + "num_tokens": 175928840.0, + "reward": 1.2982144355773926, + "reward_std": 0.13886137306690216, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29821428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3399740755558014, + "step": 1508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 800.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 418.01788330078125, + "completions/mean_terminated_length": 418.01788330078125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 1.5571318029404178, + "grad_norm": 0.6630829572677612, + "kl": 0.0830078125, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 176041605.0, + "reward": 1.2790180444717407, + "reward_std": 0.18763485550880432, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2790178656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3841826915740967, + "step": 1509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1044.0, + "completions/max_terminated_length": 1044.0, + "completions/mean_length": 541.5625, + "completions/mean_terminated_length": 541.5625, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 1.5581635285014186, + "grad_norm": 0.6752259135246277, + "kl": 0.0755615234375, + "learning_rate": 1e-06, + "loss": 0.018, + "num_tokens": 176186274.0, + "reward": 1.3950893878936768, + "reward_std": 0.19315579533576965, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.395089328289032, + "rewards/curriculum_aware_reward_fn/std": 0.38787707686424255, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 411.9910888671875, + "completions/mean_terminated_length": 411.9910888671875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 1.5591952540624194, + "grad_norm": 0.6581578850746155, + "kl": 0.0977783203125, + "learning_rate": 1e-06, + "loss": 0.0303, + "num_tokens": 176304111.0, + "reward": 1.3366073369979858, + "reward_std": 0.13431201875209808, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3366071879863739, + "rewards/curriculum_aware_reward_fn/std": 0.3686462938785553, + "step": 1511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1096.0, + "completions/max_terminated_length": 1096.0, + "completions/mean_length": 509.232177734375, + "completions/mean_terminated_length": 509.232177734375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 1.5602269796234203, + "grad_norm": 0.6564598679542542, + "kl": 0.083984375, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 176432090.0, + "reward": 1.3691965341567993, + "reward_std": 0.16052451729774475, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.37330886721611023, + "step": 1512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1023.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 450.64288330078125, + "completions/mean_terminated_length": 450.64288330078125, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 1.561258705184421, + "grad_norm": 0.7571966052055359, + "kl": 0.0841064453125, + "learning_rate": 1e-06, + "loss": -0.0105, + "num_tokens": 176553409.0, + "reward": 1.3388394117355347, + "reward_std": 0.13540898263454437, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3388392925262451, + "rewards/curriculum_aware_reward_fn/std": 0.35347121953964233, + "step": 1513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1221.0, + "completions/max_terminated_length": 1221.0, + "completions/mean_length": 482.1964416503906, + "completions/mean_terminated_length": 482.1964416503906, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 1.5622904307454217, + "grad_norm": 0.7158542275428772, + "kl": 0.080322265625, + "learning_rate": 1e-06, + "loss": 0.0177, + "num_tokens": 176671682.0, + "reward": 1.369642972946167, + "reward_std": 0.19208936393260956, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36964288353919983, + "rewards/curriculum_aware_reward_fn/std": 0.3834032714366913, + "step": 1514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 894.0, + "completions/max_terminated_length": 894.0, + "completions/mean_length": 487.6339416503906, + "completions/mean_terminated_length": 487.6339416503906, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 1.5633221563064224, + "grad_norm": 0.7497124075889587, + "kl": 0.0919189453125, + "learning_rate": 1e-06, + "loss": -0.0183, + "num_tokens": 176799685.0, + "reward": 1.3531250953674316, + "reward_std": 0.22372855246067047, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.36205360293388367, + "rewards/curriculum_aware_reward_fn/std": 0.3610074520111084, + "step": 1515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 919.0, + "completions/max_terminated_length": 919.0, + "completions/mean_length": 513.1875, + "completions/mean_terminated_length": 513.1875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 1.5643538818674232, + "grad_norm": 0.615464448928833, + "kl": 0.077880859375, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 176924222.0, + "reward": 1.3504465818405151, + "reward_std": 0.14913828670978546, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3504464328289032, + "rewards/curriculum_aware_reward_fn/std": 0.34579166769981384, + "step": 1516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1156.0, + "completions/max_terminated_length": 1156.0, + "completions/mean_length": 466.1964416503906, + "completions/mean_terminated_length": 466.1964416503906, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 1.565385607428424, + "grad_norm": 0.5593492984771729, + "kl": 0.07958984375, + "learning_rate": 1e-06, + "loss": -0.0044, + "num_tokens": 177047360.0, + "reward": 1.3919644355773926, + "reward_std": 0.11146697402000427, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39196428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.382709264755249, + "step": 1517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1133.0, + "completions/max_terminated_length": 1133.0, + "completions/mean_length": 541.0089721679688, + "completions/mean_terminated_length": 541.0089721679688, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 1.5664173329894249, + "grad_norm": 0.69778972864151, + "kl": 0.084228515625, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 177180326.0, + "reward": 1.322767972946167, + "reward_std": 0.15394167602062225, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32276788353919983, + "rewards/curriculum_aware_reward_fn/std": 0.34640294313430786, + "step": 1518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 924.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 443.5357360839844, + "completions/mean_terminated_length": 443.5357360839844, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 1.5674490585504257, + "grad_norm": 0.7092502117156982, + "kl": 0.098388671875, + "learning_rate": 1e-06, + "loss": 0.0189, + "num_tokens": 177299134.0, + "reward": 1.3830357789993286, + "reward_std": 0.17929960787296295, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38303571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.36837565898895264, + "step": 1519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 778.0, + "completions/max_terminated_length": 778.0, + "completions/mean_length": 427.2946472167969, + "completions/mean_terminated_length": 427.2946472167969, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 1.5684807841114263, + "grad_norm": 0.6543611288070679, + "kl": 0.0966796875, + "learning_rate": 1e-06, + "loss": -0.0126, + "num_tokens": 177412966.0, + "reward": 1.4455360174179077, + "reward_std": 0.17789359390735626, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44553571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3989035487174988, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1044.0, + "completions/max_terminated_length": 1044.0, + "completions/mean_length": 491.0714416503906, + "completions/mean_terminated_length": 491.0714416503906, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 1.5695125096724272, + "grad_norm": 0.7969875335693359, + "kl": 0.0858154296875, + "learning_rate": 1e-06, + "loss": 0.0099, + "num_tokens": 177543160.0, + "reward": 1.260267972946167, + "reward_std": 0.17761948704719543, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.26026788353919983, + "rewards/curriculum_aware_reward_fn/std": 0.36050131916999817, + "step": 1521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 995.0, + "completions/max_terminated_length": 995.0, + "completions/mean_length": 483.232177734375, + "completions/mean_terminated_length": 483.232177734375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 1.5705442352334278, + "grad_norm": 0.6591067314147949, + "kl": 0.08642578125, + "learning_rate": 1e-06, + "loss": 0.016, + "num_tokens": 177666960.0, + "reward": 1.333035945892334, + "reward_std": 0.16056941449642181, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33303573727607727, + "rewards/curriculum_aware_reward_fn/std": 0.3560009300708771, + "step": 1522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1145.0, + "completions/mean_length": 533.2053833007812, + "completions/mean_terminated_length": 501.1081237792969, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 1.5715759607944286, + "grad_norm": 0.7623335719108582, + "kl": 0.0941162109375, + "learning_rate": 1e-06, + "loss": 0.0693, + "num_tokens": 177798597.0, + "reward": 1.3187501430511475, + "reward_std": 0.23580670356750488, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3276786208152771, + "rewards/curriculum_aware_reward_fn/std": 0.3404977321624756, + "step": 1523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1348.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 490.5625305175781, + "completions/mean_terminated_length": 490.5625305175781, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 1.5726076863554295, + "grad_norm": 0.4751555323600769, + "kl": 0.0804443359375, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 177921286.0, + "reward": 1.3678573369979858, + "reward_std": 0.10132154077291489, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3678571581840515, + "rewards/curriculum_aware_reward_fn/std": 0.4004421532154083, + "step": 1524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 982.0, + "completions/max_terminated_length": 982.0, + "completions/mean_length": 458.107177734375, + "completions/mean_terminated_length": 458.107177734375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 1.5736394119164303, + "grad_norm": 0.8363772630691528, + "kl": 0.110107421875, + "learning_rate": 1e-06, + "loss": 0.0223, + "num_tokens": 178042034.0, + "reward": 1.4857144355773926, + "reward_std": 0.16093096137046814, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48571428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.338879257440567, + "step": 1525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 482.1250305175781, + "completions/mean_terminated_length": 449.56756591796875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 1.5746711374774311, + "grad_norm": 0.7333095073699951, + "kl": 0.09375, + "learning_rate": 1e-06, + "loss": 0.041, + "num_tokens": 178159753.0, + "reward": 1.3812501430511475, + "reward_std": 0.15002723038196564, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3901786208152771, + "rewards/curriculum_aware_reward_fn/std": 0.38868218660354614, + "step": 1526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 420.8839416503906, + "completions/mean_terminated_length": 420.8839416503906, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 1.5757028630384318, + "grad_norm": 0.7141842842102051, + "kl": 0.0958251953125, + "learning_rate": 1e-06, + "loss": 0.0263, + "num_tokens": 178269507.0, + "reward": 1.580357313156128, + "reward_std": 0.16252008080482483, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5803571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.4622876048088074, + "step": 1527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 924.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 440.294677734375, + "completions/mean_terminated_length": 440.294677734375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 1.5767345885994324, + "grad_norm": 0.7308587431907654, + "kl": 0.0908203125, + "learning_rate": 1e-06, + "loss": 0.021, + "num_tokens": 178383325.0, + "reward": 1.453125238418579, + "reward_std": 0.22767595946788788, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.453125, + "rewards/curriculum_aware_reward_fn/std": 0.38104182481765747, + "step": 1528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 447.9285888671875, + "completions/mean_terminated_length": 447.9285888671875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 1.5777663141604332, + "grad_norm": 0.7281290888786316, + "kl": 0.0958251953125, + "learning_rate": 1e-06, + "loss": -0.0181, + "num_tokens": 178500105.0, + "reward": 1.4352680444717407, + "reward_std": 0.18116579949855804, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4352678954601288, + "rewards/curriculum_aware_reward_fn/std": 0.3676464259624481, + "step": 1529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 457.90179443359375, + "completions/mean_terminated_length": 457.90179443359375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 1.578798039721434, + "grad_norm": 0.7084707021713257, + "kl": 0.085693359375, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 178619668.0, + "reward": 1.337053656578064, + "reward_std": 0.17020417749881744, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3370535671710968, + "rewards/curriculum_aware_reward_fn/std": 0.36789795756340027, + "step": 1530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 965.0, + "completions/max_terminated_length": 965.0, + "completions/mean_length": 498.8482360839844, + "completions/mean_terminated_length": 498.8482360839844, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 1.579829765282435, + "grad_norm": 0.657572865486145, + "kl": 0.0875244140625, + "learning_rate": 1e-06, + "loss": 0.0222, + "num_tokens": 178739274.0, + "reward": 1.338392972946167, + "reward_std": 0.23384582996368408, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33839288353919983, + "rewards/curriculum_aware_reward_fn/std": 0.36650174856185913, + "step": 1531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 892.0, + "completions/max_terminated_length": 892.0, + "completions/mean_length": 488.5625305175781, + "completions/mean_terminated_length": 488.5625305175781, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 1.5808614908434357, + "grad_norm": 0.7009193897247314, + "kl": 0.0966796875, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 178864476.0, + "reward": 1.4299108982086182, + "reward_std": 0.2491622120141983, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42991071939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3837029039859772, + "step": 1532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1062.0, + "completions/max_terminated_length": 1062.0, + "completions/mean_length": 496.5535888671875, + "completions/mean_terminated_length": 496.5535888671875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 1.5818932164044366, + "grad_norm": 0.5118904709815979, + "kl": 0.09130859375, + "learning_rate": 1e-06, + "loss": -0.0107, + "num_tokens": 178988270.0, + "reward": 1.3665179014205933, + "reward_std": 0.0743982195854187, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3665178716182709, + "rewards/curriculum_aware_reward_fn/std": 0.3804565966129303, + "step": 1533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1134.0, + "completions/max_terminated_length": 1134.0, + "completions/mean_length": 497.5000305175781, + "completions/mean_terminated_length": 497.5000305175781, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 1.5829249419654372, + "grad_norm": 0.6875550746917725, + "kl": 0.0859375, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 179107340.0, + "reward": 1.309821605682373, + "reward_std": 0.12580278515815735, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3098214268684387, + "rewards/curriculum_aware_reward_fn/std": 0.33293938636779785, + "step": 1534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 421.5089416503906, + "completions/mean_terminated_length": 421.5089416503906, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 1.5839566675264378, + "grad_norm": 0.6761475205421448, + "kl": 0.091064453125, + "learning_rate": 1e-06, + "loss": 0.0168, + "num_tokens": 179214567.0, + "reward": 1.4991072416305542, + "reward_std": 0.11850286275148392, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.49910715222358704, + "rewards/curriculum_aware_reward_fn/std": 0.39082854986190796, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 861.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 441.3750305175781, + "completions/mean_terminated_length": 441.3750305175781, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 1.5849883930874387, + "grad_norm": 0.710762083530426, + "kl": 0.0966796875, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 179324845.0, + "reward": 1.3973215818405151, + "reward_std": 0.17368969321250916, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3973214328289032, + "rewards/curriculum_aware_reward_fn/std": 0.41961154341697693, + "step": 1536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 465.46429443359375, + "completions/mean_terminated_length": 465.46429443359375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 1.5860201186484395, + "grad_norm": 0.6757381558418274, + "kl": 0.0928955078125, + "learning_rate": 1e-06, + "loss": 0.0155, + "num_tokens": 179443421.0, + "reward": 1.3093751668930054, + "reward_std": 0.17009426653385162, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3792792558670044, + "step": 1537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1391.0, + "completions/max_terminated_length": 1391.0, + "completions/mean_length": 463.20538330078125, + "completions/mean_terminated_length": 463.20538330078125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 1.5870518442094403, + "grad_norm": 0.6932691931724548, + "kl": 0.0926513671875, + "learning_rate": 1e-06, + "loss": 0.0081, + "num_tokens": 179550824.0, + "reward": 1.481696605682373, + "reward_std": 0.17351554334163666, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4816964268684387, + "rewards/curriculum_aware_reward_fn/std": 0.39134761691093445, + "step": 1538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1168.0, + "completions/max_terminated_length": 1168.0, + "completions/mean_length": 485.39288330078125, + "completions/mean_terminated_length": 485.39288330078125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 1.5880835697704412, + "grad_norm": 0.5837637186050415, + "kl": 0.092041015625, + "learning_rate": 1e-06, + "loss": 0.0179, + "num_tokens": 179672115.0, + "reward": 1.4620537757873535, + "reward_std": 0.09847518801689148, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4620535969734192, + "rewards/curriculum_aware_reward_fn/std": 0.3418448567390442, + "step": 1539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1359.0, + "completions/max_terminated_length": 1359.0, + "completions/mean_length": 480.294677734375, + "completions/mean_terminated_length": 480.294677734375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 1.5891152953314418, + "grad_norm": 0.6784064769744873, + "kl": 0.1007080078125, + "learning_rate": 1e-06, + "loss": 0.0131, + "num_tokens": 179794977.0, + "reward": 1.4705358743667603, + "reward_std": 0.16050057113170624, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4705357551574707, + "rewards/curriculum_aware_reward_fn/std": 0.362769216299057, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1400.0, + "completions/max_terminated_length": 1400.0, + "completions/mean_length": 537.482177734375, + "completions/mean_terminated_length": 537.482177734375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 1.5901470208924426, + "grad_norm": 0.6119048595428467, + "kl": 0.095458984375, + "learning_rate": 1e-06, + "loss": 0.0236, + "num_tokens": 179925091.0, + "reward": 1.3517858982086182, + "reward_std": 0.1388634294271469, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35178571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3608004152774811, + "step": 1541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3483.0, + "completions/max_terminated_length": 3483.0, + "completions/mean_length": 472.4464416503906, + "completions/mean_terminated_length": 472.4464416503906, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 1.5911787464534433, + "grad_norm": 0.7622206211090088, + "kl": 0.095458984375, + "learning_rate": 1e-06, + "loss": 0.0493, + "num_tokens": 180039318.0, + "reward": 1.4330357313156128, + "reward_std": 0.18418605625629425, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4330357015132904, + "rewards/curriculum_aware_reward_fn/std": 0.3550506830215454, + "step": 1542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 982.0, + "completions/max_terminated_length": 982.0, + "completions/mean_length": 472.4732360839844, + "completions/mean_terminated_length": 472.4732360839844, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 1.592210472014444, + "grad_norm": 0.623796284198761, + "kl": 0.097900390625, + "learning_rate": 1e-06, + "loss": 0.0171, + "num_tokens": 180161339.0, + "reward": 1.4433037042617798, + "reward_std": 0.12381229549646378, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44330358505249023, + "rewards/curriculum_aware_reward_fn/std": 0.37032344937324524, + "step": 1543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1320.0, + "completions/max_terminated_length": 1320.0, + "completions/mean_length": 533.6428833007812, + "completions/mean_terminated_length": 533.6428833007812, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 1.593242197575445, + "grad_norm": 0.6682934761047363, + "kl": 0.0863037109375, + "learning_rate": 1e-06, + "loss": -0.0106, + "num_tokens": 180298086.0, + "reward": 1.3924108743667603, + "reward_std": 0.1946992129087448, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3924107551574707, + "rewards/curriculum_aware_reward_fn/std": 0.378606379032135, + "step": 1544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1005.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 477.169677734375, + "completions/mean_terminated_length": 477.169677734375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 1.5942739231364458, + "grad_norm": 0.5888639092445374, + "kl": 0.0933837890625, + "learning_rate": 1e-06, + "loss": 0.0029, + "num_tokens": 180418183.0, + "reward": 1.4174107313156128, + "reward_std": 0.09873569756746292, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4174107611179352, + "rewards/curriculum_aware_reward_fn/std": 0.36556199193000793, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1291.0, + "completions/max_terminated_length": 1291.0, + "completions/mean_length": 483.89288330078125, + "completions/mean_terminated_length": 483.89288330078125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 1.5953056486974466, + "grad_norm": 0.7629478573799133, + "kl": 0.10107421875, + "learning_rate": 1e-06, + "loss": 0.0194, + "num_tokens": 180547300.0, + "reward": 1.3714287281036377, + "reward_std": 0.17599254846572876, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37142854928970337, + "rewards/curriculum_aware_reward_fn/std": 0.3239476680755615, + "step": 1546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1213.0, + "completions/max_terminated_length": 1213.0, + "completions/mean_length": 511.4285888671875, + "completions/mean_terminated_length": 511.4285888671875, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 1.5963373742584472, + "grad_norm": 0.6964306235313416, + "kl": 0.091064453125, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 180673691.0, + "reward": 1.3066965341567993, + "reward_std": 0.1090642586350441, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30669641494750977, + "rewards/curriculum_aware_reward_fn/std": 0.36959290504455566, + "step": 1547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1661.0, + "completions/max_terminated_length": 1661.0, + "completions/mean_length": 520.3482666015625, + "completions/mean_terminated_length": 520.3482666015625, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 1.597369099819448, + "grad_norm": 0.6426225900650024, + "kl": 0.0911865234375, + "learning_rate": 1e-06, + "loss": -0.0046, + "num_tokens": 180802992.0, + "reward": 1.428125023841858, + "reward_std": 0.14593568444252014, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4281250536441803, + "rewards/curriculum_aware_reward_fn/std": 0.3542395234107971, + "step": 1548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1078.0, + "completions/max_terminated_length": 1078.0, + "completions/mean_length": 482.89288330078125, + "completions/mean_terminated_length": 482.89288330078125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 1.5984008253804487, + "grad_norm": 0.7396179437637329, + "kl": 0.0921630859375, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 180928315.0, + "reward": 1.362946629524231, + "reward_std": 0.2182372510433197, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.362946480512619, + "rewards/curriculum_aware_reward_fn/std": 0.36072656512260437, + "step": 1549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1362.0, + "completions/max_terminated_length": 1362.0, + "completions/mean_length": 488.1160888671875, + "completions/mean_terminated_length": 488.1160888671875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 1.5994325509414495, + "grad_norm": 0.6970319151878357, + "kl": 0.107666015625, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 181055763.0, + "reward": 1.391964316368103, + "reward_std": 0.15285947918891907, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39196428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.388839453458786, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1025.0, + "completions/max_terminated_length": 1025.0, + "completions/mean_length": 527.8035888671875, + "completions/mean_terminated_length": 527.8035888671875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 1.6004642765024504, + "grad_norm": 0.6473516225814819, + "kl": 0.0899658203125, + "learning_rate": 1e-06, + "loss": 0.0143, + "num_tokens": 181188055.0, + "reward": 1.3830358982086182, + "reward_std": 0.16586752235889435, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38303571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3928157687187195, + "step": 1551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1181.0, + "completions/max_terminated_length": 1181.0, + "completions/mean_length": 513.125, + "completions/mean_terminated_length": 513.125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 1.6014960020634512, + "grad_norm": 0.7452658414840698, + "kl": 0.108642578125, + "learning_rate": 1e-06, + "loss": 0.0142, + "num_tokens": 181321265.0, + "reward": 1.3812501430511475, + "reward_std": 0.22256553173065186, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3901785910129547, + "rewards/curriculum_aware_reward_fn/std": 0.39110833406448364, + "step": 1552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 499.96429443359375, + "completions/mean_terminated_length": 499.96429443359375, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 1.602527727624452, + "grad_norm": 0.6822240948677063, + "kl": 0.0894775390625, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 181450264.0, + "reward": 1.325446605682373, + "reward_std": 0.18837910890579224, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3254464566707611, + "rewards/curriculum_aware_reward_fn/std": 0.34809622168540955, + "step": 1553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 862.0, + "completions/max_terminated_length": 862.0, + "completions/mean_length": 467.9285888671875, + "completions/mean_terminated_length": 467.9285888671875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 1.6035594531854527, + "grad_norm": 0.8079191446304321, + "kl": 0.0975341796875, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 181565609.0, + "reward": 1.427232265472412, + "reward_std": 0.20859649777412415, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.43616074323654175, + "rewards/curriculum_aware_reward_fn/std": 0.38602909445762634, + "step": 1554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 464.2857360839844, + "completions/mean_terminated_length": 464.2857360839844, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 1.6045911787464533, + "grad_norm": 0.6506158113479614, + "kl": 0.088623046875, + "learning_rate": 1e-06, + "loss": 0.0151, + "num_tokens": 181687679.0, + "reward": 1.3696428537368774, + "reward_std": 0.20352153480052948, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36964288353919983, + "rewards/curriculum_aware_reward_fn/std": 0.3838142454624176, + "step": 1555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1055.0, + "completions/max_terminated_length": 1055.0, + "completions/mean_length": 470.08038330078125, + "completions/mean_terminated_length": 470.08038330078125, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 1.6056229043074541, + "grad_norm": 0.8110823631286621, + "kl": 0.08837890625, + "learning_rate": 1e-06, + "loss": 0.0051, + "num_tokens": 181802999.0, + "reward": 1.3977681398391724, + "reward_std": 0.23524494469165802, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3977678716182709, + "rewards/curriculum_aware_reward_fn/std": 0.3921176493167877, + "step": 1556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 931.0, + "completions/max_terminated_length": 931.0, + "completions/mean_length": 505.7232360839844, + "completions/mean_terminated_length": 505.7232360839844, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 1.606654629868455, + "grad_norm": 0.7459127306938171, + "kl": 0.099853515625, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 181933259.0, + "reward": 1.2517858743667603, + "reward_std": 0.1917809098958969, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2517857253551483, + "rewards/curriculum_aware_reward_fn/std": 0.3410983979701996, + "step": 1557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1151.0, + "completions/max_terminated_length": 1151.0, + "completions/mean_length": 453.857177734375, + "completions/mean_terminated_length": 453.857177734375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 1.6076863554294558, + "grad_norm": 0.5741895437240601, + "kl": 0.109619140625, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 182042618.0, + "reward": 1.3982144594192505, + "reward_std": 0.11688823997974396, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39821428060531616, + "rewards/curriculum_aware_reward_fn/std": 0.5040345191955566, + "step": 1558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1043.0, + "completions/max_terminated_length": 1043.0, + "completions/mean_length": 475.544677734375, + "completions/mean_terminated_length": 475.544677734375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 1.6087180809904567, + "grad_norm": 0.6165961623191833, + "kl": 0.0887451171875, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 182170842.0, + "reward": 1.3263394832611084, + "reward_std": 0.12872597575187683, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3263393044471741, + "rewards/curriculum_aware_reward_fn/std": 0.3373158574104309, + "step": 1559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1046.0, + "completions/max_terminated_length": 1046.0, + "completions/mean_length": 464.5982360839844, + "completions/mean_terminated_length": 464.5982360839844, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 1.6097498065514573, + "grad_norm": 0.7019606828689575, + "kl": 0.0860595703125, + "learning_rate": 1e-06, + "loss": 0.0158, + "num_tokens": 182277995.0, + "reward": 1.5254465341567993, + "reward_std": 0.13480007648468018, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5254464745521545, + "rewards/curriculum_aware_reward_fn/std": 0.38072922825813293, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1210.0, + "completions/max_terminated_length": 1210.0, + "completions/mean_length": 544.0892944335938, + "completions/mean_terminated_length": 544.0892944335938, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 1.6107815321124581, + "grad_norm": 0.6408945918083191, + "kl": 0.085693359375, + "learning_rate": 1e-06, + "loss": 0.0198, + "num_tokens": 182418789.0, + "reward": 1.211160659790039, + "reward_std": 0.19551792740821838, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21116070449352264, + "rewards/curriculum_aware_reward_fn/std": 0.30117708444595337, + "step": 1561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 885.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 474.0000305175781, + "completions/mean_terminated_length": 474.0000305175781, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 1.6118132576734587, + "grad_norm": 0.6941054463386536, + "kl": 0.0889892578125, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 182536564.0, + "reward": 1.4075894355773926, + "reward_std": 0.16013428568840027, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40758928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.377951443195343, + "step": 1562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 973.0, + "completions/max_terminated_length": 973.0, + "completions/mean_length": 481.02679443359375, + "completions/mean_terminated_length": 481.02679443359375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 1.6128449832344596, + "grad_norm": 0.7757281064987183, + "kl": 0.0960693359375, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 182659424.0, + "reward": 1.2102679014205933, + "reward_std": 0.1673770248889923, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21026785671710968, + "rewards/curriculum_aware_reward_fn/std": 0.33665943145751953, + "step": 1563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 415.96429443359375, + "completions/mean_terminated_length": 415.96429443359375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 1.6138767087954604, + "grad_norm": 0.7108755707740784, + "kl": 0.0997314453125, + "learning_rate": 1e-06, + "loss": -0.0193, + "num_tokens": 182779505.0, + "reward": 1.4982143640518188, + "reward_std": 0.15602374076843262, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4982143044471741, + "rewards/curriculum_aware_reward_fn/std": 0.3791246712207794, + "step": 1564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 940.0, + "completions/max_terminated_length": 940.0, + "completions/mean_length": 432.46429443359375, + "completions/mean_terminated_length": 432.46429443359375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 1.6149084343564613, + "grad_norm": 0.6921008229255676, + "kl": 0.096923828125, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 182896625.0, + "reward": 1.497321605682373, + "reward_std": 0.1866346001625061, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5062499642372131, + "rewards/curriculum_aware_reward_fn/std": 0.356858491897583, + "step": 1565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 437.52679443359375, + "completions/mean_terminated_length": 437.52679443359375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 1.615940159917462, + "grad_norm": 0.7729184031486511, + "kl": 0.08447265625, + "learning_rate": 1e-06, + "loss": 0.0029, + "num_tokens": 183018582.0, + "reward": 1.4276787042617798, + "reward_std": 0.2603314518928528, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.43660715222358704, + "rewards/curriculum_aware_reward_fn/std": 0.43774518370628357, + "step": 1566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 800.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 437.9821472167969, + "completions/mean_terminated_length": 437.9821472167969, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 1.6169718854784627, + "grad_norm": 0.6360601186752319, + "kl": 0.095703125, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 183134241.0, + "reward": 1.4236608743667603, + "reward_std": 0.12023750692605972, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4236607253551483, + "rewards/curriculum_aware_reward_fn/std": 0.4888836741447449, + "step": 1567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1051.0, + "completions/max_terminated_length": 1051.0, + "completions/mean_length": 459.6160888671875, + "completions/mean_terminated_length": 459.6160888671875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 1.6180036110394636, + "grad_norm": 0.6790236830711365, + "kl": 0.0955810546875, + "learning_rate": 1e-06, + "loss": 0.0291, + "num_tokens": 183260149.0, + "reward": 1.3875001668930054, + "reward_std": 0.17968744039535522, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38750001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3827061057090759, + "step": 1568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1218.0, + "completions/max_terminated_length": 1218.0, + "completions/mean_length": 483.294677734375, + "completions/mean_terminated_length": 483.294677734375, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 1.6190353366004642, + "grad_norm": 0.7099661231040955, + "kl": 0.0947265625, + "learning_rate": 1e-06, + "loss": 0.0116, + "num_tokens": 183381926.0, + "reward": 1.3736608028411865, + "reward_std": 0.20166639983654022, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37366071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.38845735788345337, + "step": 1569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 691.0, + "completions/max_terminated_length": 691.0, + "completions/mean_length": 422.6875305175781, + "completions/mean_terminated_length": 422.6875305175781, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 1.620067062161465, + "grad_norm": 0.5651583075523376, + "kl": 0.0970458984375, + "learning_rate": 1e-06, + "loss": 0.0032, + "num_tokens": 183489943.0, + "reward": 1.5111607313156128, + "reward_std": 0.1064288541674614, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5111607313156128, + "rewards/curriculum_aware_reward_fn/std": 0.3640184998512268, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 876.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 477.3035888671875, + "completions/mean_terminated_length": 477.3035888671875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 1.6210987877224659, + "grad_norm": 0.6416997313499451, + "kl": 0.0816650390625, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 183616404.0, + "reward": 1.4120537042617798, + "reward_std": 0.13270309567451477, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41205358505249023, + "rewards/curriculum_aware_reward_fn/std": 0.3730955123901367, + "step": 1571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1035.0, + "completions/max_terminated_length": 1035.0, + "completions/mean_length": 507.76788330078125, + "completions/mean_terminated_length": 507.76788330078125, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 1.6221305132834667, + "grad_norm": 0.6909761428833008, + "kl": 0.0885009765625, + "learning_rate": 1e-06, + "loss": -0.015, + "num_tokens": 183746851.0, + "reward": 1.3093750476837158, + "reward_std": 0.22479885816574097, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30937501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3407418429851532, + "step": 1572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1098.0, + "completions/max_terminated_length": 1098.0, + "completions/mean_length": 518.5, + "completions/mean_terminated_length": 518.5, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 1.6231622388444675, + "grad_norm": 0.6929990649223328, + "kl": 0.0860595703125, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 183871976.0, + "reward": 1.3669644594192505, + "reward_std": 0.18981127440929413, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36696428060531616, + "rewards/curriculum_aware_reward_fn/std": 0.36745738983154297, + "step": 1573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1227.0, + "completions/max_terminated_length": 1227.0, + "completions/mean_length": 489.4285888671875, + "completions/mean_terminated_length": 489.4285888671875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 1.6241939644054681, + "grad_norm": 0.7451035976409912, + "kl": 0.0928955078125, + "learning_rate": 1e-06, + "loss": -0.0145, + "num_tokens": 183992529.0, + "reward": 1.4111608266830444, + "reward_std": 0.18551412224769592, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41116073727607727, + "rewards/curriculum_aware_reward_fn/std": 0.36234408617019653, + "step": 1574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 893.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 458.294677734375, + "completions/mean_terminated_length": 458.294677734375, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 1.6252256899664688, + "grad_norm": 0.7171928882598877, + "kl": 0.098388671875, + "learning_rate": 1e-06, + "loss": -0.0206, + "num_tokens": 184108195.0, + "reward": 1.439732313156128, + "reward_std": 0.15318885445594788, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4397321343421936, + "rewards/curriculum_aware_reward_fn/std": 0.38672444224357605, + "step": 1575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1066.0, + "completions/max_terminated_length": 1066.0, + "completions/mean_length": 468.1875305175781, + "completions/mean_terminated_length": 468.1875305175781, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 1.6262574155274696, + "grad_norm": 0.7789207100868225, + "kl": 0.078369140625, + "learning_rate": 1e-06, + "loss": 0.029, + "num_tokens": 184219012.0, + "reward": 1.5397323369979858, + "reward_std": 0.2542872130870819, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5397321581840515, + "rewards/curriculum_aware_reward_fn/std": 0.3428903818130493, + "step": 1576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1379.0, + "completions/max_terminated_length": 1379.0, + "completions/mean_length": 462.0000305175781, + "completions/mean_terminated_length": 462.0000305175781, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 1.6272891410884704, + "grad_norm": 0.6634514927864075, + "kl": 0.0869140625, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 184335793.0, + "reward": 1.3437501192092896, + "reward_std": 0.1134234219789505, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34375, + "rewards/curriculum_aware_reward_fn/std": 0.3749549686908722, + "step": 1577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 931.0, + "completions/max_terminated_length": 931.0, + "completions/mean_length": 490.40179443359375, + "completions/mean_terminated_length": 490.40179443359375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 1.6283208666494713, + "grad_norm": 0.5586764812469482, + "kl": 0.0897216796875, + "learning_rate": 1e-06, + "loss": -0.0039, + "num_tokens": 184452255.0, + "reward": 1.266517996788025, + "reward_std": 0.10985580831766129, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.26651787757873535, + "rewards/curriculum_aware_reward_fn/std": 0.35586389899253845, + "step": 1578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1086.0, + "completions/max_terminated_length": 1086.0, + "completions/mean_length": 470.5714416503906, + "completions/mean_terminated_length": 470.5714416503906, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 1.6293525922104721, + "grad_norm": 0.7115040421485901, + "kl": 0.087158203125, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 184573930.0, + "reward": 1.33973228931427, + "reward_std": 0.1624385416507721, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3397321403026581, + "rewards/curriculum_aware_reward_fn/std": 0.3505554497241974, + "step": 1579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 451.357177734375, + "completions/mean_terminated_length": 451.357177734375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 1.6303843177714727, + "grad_norm": 0.6336712837219238, + "kl": 0.0947265625, + "learning_rate": 1e-06, + "loss": 0.0262, + "num_tokens": 184691339.0, + "reward": 1.3830358982086182, + "reward_std": 0.1295861452817917, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38303571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.39561524987220764, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 906.0, + "completions/max_terminated_length": 906.0, + "completions/mean_length": 461.5000305175781, + "completions/mean_terminated_length": 461.5000305175781, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 1.6314160433324736, + "grad_norm": 0.7340649962425232, + "kl": 0.08837890625, + "learning_rate": 1e-06, + "loss": -0.0086, + "num_tokens": 184807350.0, + "reward": 1.3513394594192505, + "reward_std": 0.19727714359760284, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35133931040763855, + "rewards/curriculum_aware_reward_fn/std": 0.393326073884964, + "step": 1581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 960.0, + "completions/max_terminated_length": 960.0, + "completions/mean_length": 478.794677734375, + "completions/mean_terminated_length": 478.794677734375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 1.6324477688934742, + "grad_norm": 0.6357326507568359, + "kl": 0.0958251953125, + "learning_rate": 1e-06, + "loss": 0.0032, + "num_tokens": 184927181.0, + "reward": 1.364732265472412, + "reward_std": 0.14448943734169006, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36473211646080017, + "rewards/curriculum_aware_reward_fn/std": 0.3300040066242218, + "step": 1582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 458.1785888671875, + "completions/mean_terminated_length": 458.1785888671875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 1.633479494454475, + "grad_norm": 0.8217315673828125, + "kl": 0.09326171875, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 185043345.0, + "reward": 1.4013394117355347, + "reward_std": 0.2089354693889618, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4013392925262451, + "rewards/curriculum_aware_reward_fn/std": 0.34986868500709534, + "step": 1583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1103.0, + "completions/max_terminated_length": 1103.0, + "completions/mean_length": 497.8839416503906, + "completions/mean_terminated_length": 497.8839416503906, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 1.6345112200154759, + "grad_norm": 0.728714108467102, + "kl": 0.095947265625, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 185170837.0, + "reward": 1.2767857313156128, + "reward_std": 0.18643386662006378, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.2857142984867096, + "rewards/curriculum_aware_reward_fn/std": 0.3156167268753052, + "step": 1584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1203.0, + "completions/max_terminated_length": 1203.0, + "completions/mean_length": 492.169677734375, + "completions/mean_terminated_length": 492.169677734375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 1.6355429455764767, + "grad_norm": 0.768435001373291, + "kl": 0.1011962890625, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 185288008.0, + "reward": 1.3888393640518188, + "reward_std": 0.17833693325519562, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3888392746448517, + "rewards/curriculum_aware_reward_fn/std": 0.4032922089099884, + "step": 1585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 853.0, + "completions/max_terminated_length": 853.0, + "completions/mean_length": 414.2946472167969, + "completions/mean_terminated_length": 414.2946472167969, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 1.6365746711374776, + "grad_norm": 0.8125675916671753, + "kl": 0.0985107421875, + "learning_rate": 1e-06, + "loss": 0.0051, + "num_tokens": 185392712.0, + "reward": 1.4598214626312256, + "reward_std": 0.20808333158493042, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4598214328289032, + "rewards/curriculum_aware_reward_fn/std": 0.36612406373023987, + "step": 1586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 955.0, + "completions/max_terminated_length": 955.0, + "completions/mean_length": 509.2857360839844, + "completions/mean_terminated_length": 509.2857360839844, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 1.6376063966984782, + "grad_norm": 0.6974402666091919, + "kl": 0.087646484375, + "learning_rate": 1e-06, + "loss": 0.0205, + "num_tokens": 185516105.0, + "reward": 1.3656251430511475, + "reward_std": 0.17538422346115112, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3656249940395355, + "rewards/curriculum_aware_reward_fn/std": 0.33678603172302246, + "step": 1587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1294.0, + "completions/max_terminated_length": 1294.0, + "completions/mean_length": 506.169677734375, + "completions/mean_terminated_length": 506.169677734375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 1.638638122259479, + "grad_norm": 0.596748411655426, + "kl": 0.0869140625, + "learning_rate": 1e-06, + "loss": 0.0081, + "num_tokens": 185638848.0, + "reward": 1.3928571939468384, + "reward_std": 0.15835700929164886, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3928571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.38278597593307495, + "step": 1588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 463.0625305175781, + "completions/mean_terminated_length": 463.0625305175781, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 1.6396698478204796, + "grad_norm": 0.8352671265602112, + "kl": 0.0986328125, + "learning_rate": 1e-06, + "loss": 0.0067, + "num_tokens": 185759059.0, + "reward": 1.3794643878936768, + "reward_std": 0.1755049079656601, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3794642984867096, + "rewards/curriculum_aware_reward_fn/std": 0.3583969175815582, + "step": 1589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 947.0, + "completions/max_terminated_length": 947.0, + "completions/mean_length": 441.96429443359375, + "completions/mean_terminated_length": 441.96429443359375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 1.6407015733814805, + "grad_norm": 0.7398374676704407, + "kl": 0.0970458984375, + "learning_rate": 1e-06, + "loss": -0.0097, + "num_tokens": 185873834.0, + "reward": 1.4803574085235596, + "reward_std": 0.16756568849086761, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48035717010498047, + "rewards/curriculum_aware_reward_fn/std": 0.3915410041809082, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 996.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 494.8839416503906, + "completions/mean_terminated_length": 494.8839416503906, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 1.6417332989424813, + "grad_norm": 0.6949180960655212, + "kl": 0.087646484375, + "learning_rate": 1e-06, + "loss": -0.0062, + "num_tokens": 185994039.0, + "reward": 1.3848215341567993, + "reward_std": 0.1573912799358368, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38482144474983215, + "rewards/curriculum_aware_reward_fn/std": 0.3977889120578766, + "step": 1591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 416.5446472167969, + "completions/mean_terminated_length": 416.5446472167969, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 1.6427650245034822, + "grad_norm": 0.7075119018554688, + "kl": 0.10205078125, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 186105666.0, + "reward": 1.6562501192092896, + "reward_std": 0.17057617008686066, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.65625, + "rewards/curriculum_aware_reward_fn/std": 0.2933286130428314, + "step": 1592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1830.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 479.8035888671875, + "completions/mean_terminated_length": 479.8035888671875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 1.643796750064483, + "grad_norm": 0.6478801369667053, + "kl": 0.084716796875, + "learning_rate": 1e-06, + "loss": 0.0262, + "num_tokens": 186223020.0, + "reward": 1.5705358982086182, + "reward_std": 0.1583946794271469, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5705357193946838, + "rewards/curriculum_aware_reward_fn/std": 0.37044858932495117, + "step": 1593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 914.0, + "completions/mean_length": 527.6875, + "completions/mean_terminated_length": 495.5405578613281, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 1.6448284756254836, + "grad_norm": 0.6908033490180969, + "kl": 0.0889892578125, + "learning_rate": 1e-06, + "loss": 0.0491, + "num_tokens": 186356573.0, + "reward": 1.3339287042617798, + "reward_std": 0.2090333253145218, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.35178571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3469901978969574, + "step": 1594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1386.0, + "completions/max_terminated_length": 1386.0, + "completions/mean_length": 470.2500305175781, + "completions/mean_terminated_length": 470.2500305175781, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 1.6458602011864842, + "grad_norm": 0.7876476645469666, + "kl": 0.093994140625, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 186474166.0, + "reward": 1.5098215341567993, + "reward_std": 0.2147691696882248, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5098214745521545, + "rewards/curriculum_aware_reward_fn/std": 0.35111403465270996, + "step": 1595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1365.0, + "completions/max_terminated_length": 1365.0, + "completions/mean_length": 479.77679443359375, + "completions/mean_terminated_length": 479.77679443359375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 1.646891926747485, + "grad_norm": 0.6431217193603516, + "kl": 0.0885009765625, + "learning_rate": 1e-06, + "loss": 0.0137, + "num_tokens": 186595423.0, + "reward": 1.2848215103149414, + "reward_std": 0.1607237607240677, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28482142090797424, + "rewards/curriculum_aware_reward_fn/std": 0.3498287498950958, + "step": 1596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1157.0, + "completions/max_terminated_length": 1157.0, + "completions/mean_length": 553.7142944335938, + "completions/mean_terminated_length": 553.7142944335938, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 1.647923652308486, + "grad_norm": 0.7455097436904907, + "kl": 0.093505859375, + "learning_rate": 1e-06, + "loss": 0.0118, + "num_tokens": 186731765.0, + "reward": 1.4160715341567993, + "reward_std": 0.24613210558891296, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.42499998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.36693274974823, + "step": 1597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 802.0, + "completions/max_terminated_length": 802.0, + "completions/mean_length": 470.107177734375, + "completions/mean_terminated_length": 470.107177734375, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 1.6489553778694868, + "grad_norm": 0.8170499801635742, + "kl": 0.119384765625, + "learning_rate": 1e-06, + "loss": 0.0128, + "num_tokens": 186852681.0, + "reward": 1.3946430683135986, + "reward_std": 0.14803080260753632, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3946428596973419, + "rewards/curriculum_aware_reward_fn/std": 0.35869866609573364, + "step": 1598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1045.0, + "completions/max_terminated_length": 1045.0, + "completions/mean_length": 520.357177734375, + "completions/mean_terminated_length": 520.357177734375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 1.6499871034304876, + "grad_norm": 0.853003203868866, + "kl": 0.084716796875, + "learning_rate": 1e-06, + "loss": 0.0289, + "num_tokens": 186971252.0, + "reward": 1.3031251430511475, + "reward_std": 0.25454646348953247, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3120535910129547, + "rewards/curriculum_aware_reward_fn/std": 0.3552214205265045, + "step": 1599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 939.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 513.4732666015625, + "completions/mean_terminated_length": 513.4732666015625, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 1.6510188289914882, + "grad_norm": 0.7009985446929932, + "kl": 0.098876953125, + "learning_rate": 1e-06, + "loss": -0.0085, + "num_tokens": 187103297.0, + "reward": 1.3316963911056519, + "reward_std": 0.20880287885665894, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33169645071029663, + "rewards/curriculum_aware_reward_fn/std": 0.4298541247844696, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2025.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 486.3750305175781, + "completions/mean_terminated_length": 486.3750305175781, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 1.652050554552489, + "grad_norm": 0.766791045665741, + "kl": 0.0958251953125, + "learning_rate": 1e-06, + "loss": 0.0189, + "num_tokens": 187217845.0, + "reward": 1.3294644355773926, + "reward_std": 0.23240506649017334, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32946428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3554944396018982, + "step": 1601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1122.0, + "completions/max_terminated_length": 1122.0, + "completions/mean_length": 509.6250305175781, + "completions/mean_terminated_length": 509.6250305175781, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 1.6530822801134897, + "grad_norm": 0.7172535061836243, + "kl": 0.09619140625, + "learning_rate": 1e-06, + "loss": 0.0198, + "num_tokens": 187348857.0, + "reward": 1.3571429252624512, + "reward_std": 0.15233953297138214, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3571428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.38425412774086, + "step": 1602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 847.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 497.482177734375, + "completions/mean_terminated_length": 497.482177734375, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 1.6541140056744905, + "grad_norm": 0.7336816787719727, + "kl": 0.09521484375, + "learning_rate": 1e-06, + "loss": -0.0047, + "num_tokens": 187471383.0, + "reward": 1.4017857313156128, + "reward_std": 0.1852165162563324, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4017857015132904, + "rewards/curriculum_aware_reward_fn/std": 0.3883395493030548, + "step": 1603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1053.0, + "completions/max_terminated_length": 1053.0, + "completions/mean_length": 486.1160888671875, + "completions/mean_terminated_length": 486.1160888671875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 1.6551457312354914, + "grad_norm": 0.7177641987800598, + "kl": 0.087890625, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 187597900.0, + "reward": 1.455357313156128, + "reward_std": 0.20527367293834686, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.455357164144516, + "rewards/curriculum_aware_reward_fn/std": 0.33270373940467834, + "step": 1604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 489.544677734375, + "completions/mean_terminated_length": 489.544677734375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 1.6561774567964922, + "grad_norm": 0.785552442073822, + "kl": 0.0997314453125, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 187715155.0, + "reward": 1.3361608982086182, + "reward_std": 0.20291408896446228, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.34508928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3495788872241974, + "step": 1605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1128.0, + "completions/max_terminated_length": 1128.0, + "completions/mean_length": 488.6875305175781, + "completions/mean_terminated_length": 488.6875305175781, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 1.657209182357493, + "grad_norm": 0.7447234988212585, + "kl": 0.0999755859375, + "learning_rate": 1e-06, + "loss": -0.0046, + "num_tokens": 187839377.0, + "reward": 1.3928571939468384, + "reward_std": 0.14517280459403992, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.392857164144516, + "rewards/curriculum_aware_reward_fn/std": 0.35277900099754333, + "step": 1606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 895.0, + "completions/max_terminated_length": 895.0, + "completions/mean_length": 505.107177734375, + "completions/mean_terminated_length": 505.107177734375, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 1.6582409079184937, + "grad_norm": 0.728800892829895, + "kl": 0.1026611328125, + "learning_rate": 1e-06, + "loss": 0.0157, + "num_tokens": 187971360.0, + "reward": 1.4093750715255737, + "reward_std": 0.2143835425376892, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40937501192092896, + "rewards/curriculum_aware_reward_fn/std": 0.3755195438861847, + "step": 1607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 984.0, + "completions/max_terminated_length": 984.0, + "completions/mean_length": 566.8839721679688, + "completions/mean_terminated_length": 566.8839721679688, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 1.6592726334794945, + "grad_norm": 0.6726198196411133, + "kl": 0.0870361328125, + "learning_rate": 1e-06, + "loss": 0.0211, + "num_tokens": 188107098.0, + "reward": 1.3361608982086182, + "reward_std": 0.19342978298664093, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33616071939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3774147629737854, + "step": 1608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 958.0, + "completions/max_terminated_length": 958.0, + "completions/mean_length": 474.39288330078125, + "completions/mean_terminated_length": 474.39288330078125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 1.6603043590404951, + "grad_norm": 0.73554927110672, + "kl": 0.1007080078125, + "learning_rate": 1e-06, + "loss": -0.0131, + "num_tokens": 188224112.0, + "reward": 1.3187501430511475, + "reward_std": 0.14685951173305511, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3187499940395355, + "rewards/curriculum_aware_reward_fn/std": 0.3741130530834198, + "step": 1609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1246.0, + "completions/max_terminated_length": 1246.0, + "completions/mean_length": 493.65179443359375, + "completions/mean_terminated_length": 493.65179443359375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 1.661336084601496, + "grad_norm": 0.7111493349075317, + "kl": 0.093017578125, + "learning_rate": 1e-06, + "loss": -0.004, + "num_tokens": 188357325.0, + "reward": 1.2754465341567993, + "reward_std": 0.13824760913848877, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.27544644474983215, + "rewards/curriculum_aware_reward_fn/std": 0.3375852108001709, + "step": 1610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 951.0, + "completions/max_terminated_length": 951.0, + "completions/mean_length": 466.5000305175781, + "completions/mean_terminated_length": 466.5000305175781, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 1.6623678101624968, + "grad_norm": 0.7171905636787415, + "kl": 0.1085205078125, + "learning_rate": 1e-06, + "loss": -0.0034, + "num_tokens": 188475730.0, + "reward": 1.525892972946167, + "reward_std": 0.16963417828083038, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5258928537368774, + "rewards/curriculum_aware_reward_fn/std": 0.40647807717323303, + "step": 1611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 953.0, + "completions/max_terminated_length": 953.0, + "completions/mean_length": 521.2589721679688, + "completions/mean_terminated_length": 521.2589721679688, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 1.6633995357234976, + "grad_norm": 0.6403311491012573, + "kl": 0.10107421875, + "learning_rate": 1e-06, + "loss": 0.0336, + "num_tokens": 188600779.0, + "reward": 1.3357144594192505, + "reward_std": 0.1563766449689865, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33571431040763855, + "rewards/curriculum_aware_reward_fn/std": 0.38720694184303284, + "step": 1612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1240.0, + "completions/max_terminated_length": 1240.0, + "completions/mean_length": 537.3392944335938, + "completions/mean_terminated_length": 537.3392944335938, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 1.6644312612844985, + "grad_norm": 0.759138286113739, + "kl": 0.0963134765625, + "learning_rate": 1e-06, + "loss": 0.0245, + "num_tokens": 188727398.0, + "reward": 1.4004465341567993, + "reward_std": 0.14537791907787323, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40044641494750977, + "rewards/curriculum_aware_reward_fn/std": 0.3470918834209442, + "step": 1613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 979.0, + "completions/max_terminated_length": 979.0, + "completions/mean_length": 533.875, + "completions/mean_terminated_length": 533.875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 1.665462986845499, + "grad_norm": 0.6733611822128296, + "kl": 0.0927734375, + "learning_rate": 1e-06, + "loss": 0.0243, + "num_tokens": 188860895.0, + "reward": 1.3901787996292114, + "reward_std": 0.13679122924804688, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3901785910129547, + "rewards/curriculum_aware_reward_fn/std": 0.35072892904281616, + "step": 1614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 973.0, + "completions/max_terminated_length": 973.0, + "completions/mean_length": 487.4464416503906, + "completions/mean_terminated_length": 487.4464416503906, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 1.6664947124064997, + "grad_norm": 0.7703580856323242, + "kl": 0.114013671875, + "learning_rate": 1e-06, + "loss": 0.0124, + "num_tokens": 188985426.0, + "reward": 1.38660728931427, + "reward_std": 0.13961534202098846, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38660717010498047, + "rewards/curriculum_aware_reward_fn/std": 0.3742850422859192, + "step": 1615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1385.0, + "completions/max_terminated_length": 1385.0, + "completions/mean_length": 562.6785888671875, + "completions/mean_terminated_length": 562.6785888671875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 1.6675264379675006, + "grad_norm": 0.5722839832305908, + "kl": 0.0897216796875, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 189108982.0, + "reward": 1.3946430683135986, + "reward_std": 0.16097982227802277, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3946428596973419, + "rewards/curriculum_aware_reward_fn/std": 0.39708155393600464, + "step": 1616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1191.0, + "completions/max_terminated_length": 1191.0, + "completions/mean_length": 529.4285888671875, + "completions/mean_terminated_length": 529.4285888671875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 1.6685581635285014, + "grad_norm": 0.7239696979522705, + "kl": 0.0936279296875, + "learning_rate": 1e-06, + "loss": 0.0024, + "num_tokens": 189234862.0, + "reward": 1.3558037281036377, + "reward_std": 0.16733594238758087, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35580357909202576, + "rewards/curriculum_aware_reward_fn/std": 0.33582931756973267, + "step": 1617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 476.4464416503906, + "completions/mean_terminated_length": 476.4464416503906, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 1.6695898890895022, + "grad_norm": 0.7931858897209167, + "kl": 0.1002197265625, + "learning_rate": 1e-06, + "loss": 0.0144, + "num_tokens": 189360758.0, + "reward": 1.339285969734192, + "reward_std": 0.21934357285499573, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.3571428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.37637779116630554, + "step": 1618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1023.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 512.1964721679688, + "completions/mean_terminated_length": 512.1964721679688, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 1.670621614650503, + "grad_norm": 0.7632045745849609, + "kl": 0.095703125, + "learning_rate": 1e-06, + "loss": 0.0193, + "num_tokens": 189487242.0, + "reward": 1.345089316368103, + "reward_std": 0.18178538978099823, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34508928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3602401316165924, + "step": 1619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 940.0, + "completions/max_terminated_length": 940.0, + "completions/mean_length": 471.14288330078125, + "completions/mean_terminated_length": 471.14288330078125, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 1.6716533402115037, + "grad_norm": 1.4711685180664062, + "kl": 0.1971435546875, + "learning_rate": 1e-06, + "loss": 0.0177, + "num_tokens": 189603591.0, + "reward": 1.3727679252624512, + "reward_std": 0.17584413290023804, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3816964328289032, + "rewards/curriculum_aware_reward_fn/std": 0.38782110810279846, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 472.14288330078125, + "completions/mean_terminated_length": 472.14288330078125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 1.6726850657725045, + "grad_norm": 0.7945736646652222, + "kl": 0.101318359375, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 189716979.0, + "reward": 1.5071431398391724, + "reward_std": 0.18330030143260956, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5071429014205933, + "rewards/curriculum_aware_reward_fn/std": 0.37643763422966003, + "step": 1621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1087.0, + "completions/max_terminated_length": 1087.0, + "completions/mean_length": 459.0357360839844, + "completions/mean_terminated_length": 459.0357360839844, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 1.6737167913335051, + "grad_norm": 0.8028322458267212, + "kl": 0.11181640625, + "learning_rate": 1e-06, + "loss": -0.0049, + "num_tokens": 189832028.0, + "reward": 1.4718750715255737, + "reward_std": 0.18403606116771698, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47187501192092896, + "rewards/curriculum_aware_reward_fn/std": 0.39385947585105896, + "step": 1622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1351.0, + "completions/max_terminated_length": 1351.0, + "completions/mean_length": 522.107177734375, + "completions/mean_terminated_length": 522.107177734375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 1.674748516894506, + "grad_norm": 0.6720342040061951, + "kl": 0.1004638671875, + "learning_rate": 1e-06, + "loss": -0.0078, + "num_tokens": 189957487.0, + "reward": 1.3482143878936768, + "reward_std": 0.1303005814552307, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3571428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.34989428520202637, + "step": 1623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1197.0, + "completions/max_terminated_length": 1197.0, + "completions/mean_length": 515.4910888671875, + "completions/mean_terminated_length": 515.4910888671875, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 1.6757802424555068, + "grad_norm": 0.6873416900634766, + "kl": 0.1005859375, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 190071397.0, + "reward": 1.356250286102295, + "reward_std": 0.12795774638652802, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.37830376625061035, + "step": 1624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1272.0, + "completions/max_terminated_length": 1272.0, + "completions/mean_length": 505.7589416503906, + "completions/mean_terminated_length": 505.7589416503906, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 1.6768119680165077, + "grad_norm": 0.8274422883987427, + "kl": 0.10498046875, + "learning_rate": 1e-06, + "loss": 0.0366, + "num_tokens": 190203222.0, + "reward": 1.3004465103149414, + "reward_std": 0.2362377792596817, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30044645071029663, + "rewards/curriculum_aware_reward_fn/std": 0.3548566401004791, + "step": 1625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1102.0, + "completions/max_terminated_length": 1102.0, + "completions/mean_length": 455.45538330078125, + "completions/mean_terminated_length": 455.45538330078125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 1.6778436935775085, + "grad_norm": 0.5675816535949707, + "kl": 0.095703125, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 190322679.0, + "reward": 1.4080358743667603, + "reward_std": 0.09176620095968246, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4080357253551483, + "rewards/curriculum_aware_reward_fn/std": 0.38088053464889526, + "step": 1626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1245.0, + "completions/max_terminated_length": 1245.0, + "completions/mean_length": 488.294677734375, + "completions/mean_terminated_length": 488.294677734375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 1.6788754191385091, + "grad_norm": 0.7279961705207825, + "kl": 0.0906982421875, + "learning_rate": 1e-06, + "loss": 0.0116, + "num_tokens": 190439395.0, + "reward": 1.5200895071029663, + "reward_std": 0.1635468453168869, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5200892686843872, + "rewards/curriculum_aware_reward_fn/std": 0.37208083271980286, + "step": 1627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1001.0, + "completions/max_terminated_length": 1001.0, + "completions/mean_length": 495.919677734375, + "completions/mean_terminated_length": 495.919677734375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 1.67990714469951, + "grad_norm": 0.8204648494720459, + "kl": 0.106201171875, + "learning_rate": 1e-06, + "loss": 0.0208, + "num_tokens": 190565743.0, + "reward": 1.254910945892334, + "reward_std": 0.21975882351398468, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2549107074737549, + "rewards/curriculum_aware_reward_fn/std": 0.31785881519317627, + "step": 1628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 484.5000305175781, + "completions/mean_terminated_length": 484.5000305175781, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 1.6809388702605106, + "grad_norm": 0.6664847731590271, + "kl": 0.099853515625, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 190682951.0, + "reward": 1.4169644117355347, + "reward_std": 0.10810566693544388, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41696426272392273, + "rewards/curriculum_aware_reward_fn/std": 0.3685590326786041, + "step": 1629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 919.0, + "completions/max_terminated_length": 919.0, + "completions/mean_length": 532.7410888671875, + "completions/mean_terminated_length": 532.7410888671875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 1.6819705958215114, + "grad_norm": 0.7823235392570496, + "kl": 0.1058349609375, + "learning_rate": 1e-06, + "loss": 0.0251, + "num_tokens": 190816524.0, + "reward": 1.2272322177886963, + "reward_std": 0.1962169110774994, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2272321730852127, + "rewards/curriculum_aware_reward_fn/std": 0.28179672360420227, + "step": 1630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1198.0, + "completions/max_terminated_length": 1198.0, + "completions/mean_length": 531.5892944335938, + "completions/mean_terminated_length": 531.5892944335938, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 1.6830023213825123, + "grad_norm": 0.5920828580856323, + "kl": 0.0965576171875, + "learning_rate": 1e-06, + "loss": -0.0145, + "num_tokens": 190947917.0, + "reward": 1.3424108028411865, + "reward_std": 0.1325589269399643, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34241074323654175, + "rewards/curriculum_aware_reward_fn/std": 0.3899185359477997, + "step": 1631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1003.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 550.6964721679688, + "completions/mean_terminated_length": 550.6964721679688, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 1.684034046943513, + "grad_norm": 0.7132331132888794, + "kl": 0.0924072265625, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 191081415.0, + "reward": 1.1437500715255737, + "reward_std": 0.15005722641944885, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.14374999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.2836478054523468, + "step": 1632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 489.6964416503906, + "completions/mean_terminated_length": 489.6964416503906, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 1.685065772504514, + "grad_norm": 0.7506689429283142, + "kl": 0.10888671875, + "learning_rate": 1e-06, + "loss": 0.0161, + "num_tokens": 191205764.0, + "reward": 1.3625000715255737, + "reward_std": 0.1464381217956543, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36250001192092896, + "rewards/curriculum_aware_reward_fn/std": 0.371416836977005, + "step": 1633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1004.0, + "completions/mean_length": 543.732177734375, + "completions/mean_terminated_length": 511.729736328125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 1.6860974980655146, + "grad_norm": 0.7990245819091797, + "kl": 0.0947265625, + "learning_rate": 1e-06, + "loss": 0.0161, + "num_tokens": 191333218.0, + "reward": 1.4200894832611084, + "reward_std": 0.2204861342906952, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4290178716182709, + "rewards/curriculum_aware_reward_fn/std": 0.306508868932724, + "step": 1634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1108.0, + "completions/max_terminated_length": 1108.0, + "completions/mean_length": 557.4107666015625, + "completions/mean_terminated_length": 557.4107666015625, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 1.6871292236265152, + "grad_norm": 0.6546155214309692, + "kl": 0.0948486328125, + "learning_rate": 1e-06, + "loss": 0.0145, + "num_tokens": 191460573.0, + "reward": 1.2379465103149414, + "reward_std": 0.14424559473991394, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.24687500298023224, + "rewards/curriculum_aware_reward_fn/std": 0.3062620759010315, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 986.0, + "completions/max_terminated_length": 986.0, + "completions/mean_length": 470.58929443359375, + "completions/mean_terminated_length": 470.58929443359375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 1.688160949187516, + "grad_norm": 0.8744813799858093, + "kl": 0.1126708984375, + "learning_rate": 1e-06, + "loss": 0.0053, + "num_tokens": 191581424.0, + "reward": 1.4120537042617798, + "reward_std": 0.21662786602973938, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41205358505249023, + "rewards/curriculum_aware_reward_fn/std": 0.3654101490974426, + "step": 1636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 942.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 504.5535888671875, + "completions/mean_terminated_length": 504.5535888671875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 1.6891926747485169, + "grad_norm": 0.8263741731643677, + "kl": 0.0985107421875, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 191711812.0, + "reward": 1.427232265472412, + "reward_std": 0.23317548632621765, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.43616074323654175, + "rewards/curriculum_aware_reward_fn/std": 0.38439202308654785, + "step": 1637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 949.0, + "completions/max_terminated_length": 949.0, + "completions/mean_length": 491.7589416503906, + "completions/mean_terminated_length": 491.7589416503906, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 1.6902244003095177, + "grad_norm": 0.8051194548606873, + "kl": 0.1053466796875, + "learning_rate": 1e-06, + "loss": -0.0078, + "num_tokens": 191836329.0, + "reward": 1.4125001430511475, + "reward_std": 0.20308110117912292, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4125000536441803, + "rewards/curriculum_aware_reward_fn/std": 0.388430655002594, + "step": 1638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 930.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 459.4107360839844, + "completions/mean_terminated_length": 459.4107360839844, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 1.6912561258705185, + "grad_norm": 0.798630952835083, + "kl": 0.105712890625, + "learning_rate": 1e-06, + "loss": 0.011, + "num_tokens": 191952148.0, + "reward": 1.409821629524231, + "reward_std": 0.13976800441741943, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40982145071029663, + "rewards/curriculum_aware_reward_fn/std": 0.3854820132255554, + "step": 1639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1081.0, + "completions/max_terminated_length": 1081.0, + "completions/mean_length": 525.3482666015625, + "completions/mean_terminated_length": 525.3482666015625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 1.6922878514315192, + "grad_norm": 0.8330475091934204, + "kl": 0.09716796875, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 192074351.0, + "reward": 1.3924108743667603, + "reward_std": 0.18537965416908264, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3924107253551483, + "rewards/curriculum_aware_reward_fn/std": 0.3159577250480652, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1070.0, + "completions/max_terminated_length": 1070.0, + "completions/mean_length": 478.0982360839844, + "completions/mean_terminated_length": 478.0982360839844, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 1.69331957699252, + "grad_norm": 0.7114593982696533, + "kl": 0.1033935546875, + "learning_rate": 1e-06, + "loss": -0.0304, + "num_tokens": 192190296.0, + "reward": 1.427232265472412, + "reward_std": 0.20058543980121613, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42723217606544495, + "rewards/curriculum_aware_reward_fn/std": 0.40569642186164856, + "step": 1641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 903.0, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 523.6160888671875, + "completions/mean_terminated_length": 523.6160888671875, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 1.6943513025535206, + "grad_norm": 0.7585659027099609, + "kl": 0.1019287109375, + "learning_rate": 1e-06, + "loss": -0.0042, + "num_tokens": 192312333.0, + "reward": 1.2558037042617798, + "reward_std": 0.15198412537574768, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25580358505249023, + "rewards/curriculum_aware_reward_fn/std": 0.3548724949359894, + "step": 1642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 448.64288330078125, + "completions/mean_terminated_length": 448.64288330078125, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 1.6953830281145215, + "grad_norm": 0.7520537972450256, + "kl": 0.0989990234375, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 192423488.0, + "reward": 1.411160945892334, + "reward_std": 0.17044828832149506, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4111607074737549, + "rewards/curriculum_aware_reward_fn/std": 0.3557576537132263, + "step": 1643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1032.0, + "completions/max_terminated_length": 1032.0, + "completions/mean_length": 518.9553833007812, + "completions/mean_terminated_length": 518.9553833007812, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 1.6964147536755223, + "grad_norm": 0.7355285286903381, + "kl": 0.0966796875, + "learning_rate": 1e-06, + "loss": -0.0086, + "num_tokens": 192540824.0, + "reward": 1.3866074085235596, + "reward_std": 0.22177164256572723, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3866071403026581, + "rewards/curriculum_aware_reward_fn/std": 0.36619433760643005, + "step": 1644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1186.0, + "completions/max_terminated_length": 1186.0, + "completions/mean_length": 522.9375, + "completions/mean_terminated_length": 522.9375, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 1.6974464792365231, + "grad_norm": 0.7334874868392944, + "kl": 0.103515625, + "learning_rate": 1e-06, + "loss": 0.0423, + "num_tokens": 192679627.0, + "reward": 1.321428656578064, + "reward_std": 0.20126299560070038, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3214285671710968, + "rewards/curriculum_aware_reward_fn/std": 0.34095218777656555, + "step": 1645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1029.0, + "completions/max_terminated_length": 1029.0, + "completions/mean_length": 515.5535888671875, + "completions/mean_terminated_length": 515.5535888671875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 1.698478204797524, + "grad_norm": 0.496624231338501, + "kl": 0.0982666015625, + "learning_rate": 1e-06, + "loss": 0.0127, + "num_tokens": 192801223.0, + "reward": 1.3687502145767212, + "reward_std": 0.09063854068517685, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36875003576278687, + "rewards/curriculum_aware_reward_fn/std": 0.3997817933559418, + "step": 1646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1053.0, + "completions/max_terminated_length": 1053.0, + "completions/mean_length": 521.7232666015625, + "completions/mean_terminated_length": 521.7232666015625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 1.6995099303585246, + "grad_norm": 0.7509146928787231, + "kl": 0.110107421875, + "learning_rate": 1e-06, + "loss": -0.005, + "num_tokens": 192937356.0, + "reward": 1.3142858743667603, + "reward_std": 0.18662287294864655, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3142856955528259, + "rewards/curriculum_aware_reward_fn/std": 0.35459840297698975, + "step": 1647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1137.0, + "completions/max_terminated_length": 1137.0, + "completions/mean_length": 476.83929443359375, + "completions/mean_terminated_length": 476.83929443359375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 1.7005416559195254, + "grad_norm": 0.6993504762649536, + "kl": 0.101806640625, + "learning_rate": 1e-06, + "loss": -0.0083, + "num_tokens": 193053902.0, + "reward": 1.3772321939468384, + "reward_std": 0.12742431461811066, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3772321343421936, + "rewards/curriculum_aware_reward_fn/std": 0.37781307101249695, + "step": 1648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 878.0, + "completions/max_terminated_length": 878.0, + "completions/mean_length": 534.375, + "completions/mean_terminated_length": 534.375, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 1.701573381480526, + "grad_norm": 0.7881371378898621, + "kl": 0.1044921875, + "learning_rate": 1e-06, + "loss": -0.0062, + "num_tokens": 193184795.0, + "reward": 1.296875, + "reward_std": 0.21468262374401093, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3058035671710968, + "rewards/curriculum_aware_reward_fn/std": 0.361537367105484, + "step": 1649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1022.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 526.9464721679688, + "completions/mean_terminated_length": 526.9464721679688, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 1.702605107041527, + "grad_norm": 0.7941908240318298, + "kl": 0.1002197265625, + "learning_rate": 1e-06, + "loss": 0.0067, + "num_tokens": 193312581.0, + "reward": 1.2870537042617798, + "reward_std": 0.20489533245563507, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28705358505249023, + "rewards/curriculum_aware_reward_fn/std": 0.36222195625305176, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1183.0, + "completions/max_terminated_length": 1183.0, + "completions/mean_length": 491.9285888671875, + "completions/mean_terminated_length": 491.9285888671875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 1.7036368326025277, + "grad_norm": 0.6850597858428955, + "kl": 0.10009765625, + "learning_rate": 1e-06, + "loss": 0.0125, + "num_tokens": 193435297.0, + "reward": 1.3665181398391724, + "reward_std": 0.16435275971889496, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3665178716182709, + "rewards/curriculum_aware_reward_fn/std": 0.3578205108642578, + "step": 1651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 825.0, + "completions/max_terminated_length": 825.0, + "completions/mean_length": 487.08929443359375, + "completions/mean_terminated_length": 487.08929443359375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 1.7046685581635286, + "grad_norm": 0.7362051010131836, + "kl": 0.104736328125, + "learning_rate": 1e-06, + "loss": 0.0117, + "num_tokens": 193551976.0, + "reward": 1.3866074085235596, + "reward_std": 0.21596673130989075, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3866071403026581, + "rewards/curriculum_aware_reward_fn/std": 0.4046454429626465, + "step": 1652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 499.9910888671875, + "completions/mean_terminated_length": 499.9910888671875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 1.7057002837245294, + "grad_norm": 0.7955490946769714, + "kl": 0.1060791015625, + "learning_rate": 1e-06, + "loss": -0.0096, + "num_tokens": 193674076.0, + "reward": 1.3584822416305542, + "reward_std": 0.14499978721141815, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35848215222358704, + "rewards/curriculum_aware_reward_fn/std": 0.3501536548137665, + "step": 1653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 902.0, + "completions/max_terminated_length": 902.0, + "completions/mean_length": 485.294677734375, + "completions/mean_terminated_length": 485.294677734375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 1.70673200928553, + "grad_norm": 0.7266811728477478, + "kl": 0.111572265625, + "learning_rate": 1e-06, + "loss": 0.0167, + "num_tokens": 193794614.0, + "reward": 1.4517858028411865, + "reward_std": 0.1658746898174286, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45178574323654175, + "rewards/curriculum_aware_reward_fn/std": 0.38891908526420593, + "step": 1654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 876.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 456.0089416503906, + "completions/mean_terminated_length": 456.0089416503906, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 1.7077637348465307, + "grad_norm": 0.7519125938415527, + "kl": 0.1019287109375, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 193909450.0, + "reward": 1.466071605682373, + "reward_std": 0.15212328732013702, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4660714268684387, + "rewards/curriculum_aware_reward_fn/std": 0.3495952785015106, + "step": 1655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1129.0, + "completions/max_terminated_length": 1129.0, + "completions/mean_length": 532.4375, + "completions/mean_terminated_length": 532.4375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 1.7087954604075315, + "grad_norm": 0.8475361466407776, + "kl": 0.106689453125, + "learning_rate": 1e-06, + "loss": 0.0071, + "num_tokens": 194028863.0, + "reward": 1.3598215579986572, + "reward_std": 0.18619807064533234, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35982146859169006, + "rewards/curriculum_aware_reward_fn/std": 0.3328717052936554, + "step": 1656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 781.0, + "completions/max_terminated_length": 781.0, + "completions/mean_length": 447.1607360839844, + "completions/mean_terminated_length": 447.1607360839844, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 1.7098271859685323, + "grad_norm": 0.8995668888092041, + "kl": 0.1075439453125, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 194141168.0, + "reward": 1.4339287281036377, + "reward_std": 0.20498405396938324, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.44285720586776733, + "rewards/curriculum_aware_reward_fn/std": 0.36129504442214966, + "step": 1657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 879.0, + "completions/max_terminated_length": 879.0, + "completions/mean_length": 471.6339416503906, + "completions/mean_terminated_length": 471.6339416503906, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 1.7108589115295332, + "grad_norm": 0.7614728808403015, + "kl": 0.110107421875, + "learning_rate": 1e-06, + "loss": 0.0247, + "num_tokens": 194269150.0, + "reward": 1.3857144117355347, + "reward_std": 0.16251114010810852, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3857143223285675, + "rewards/curriculum_aware_reward_fn/std": 0.3841703534126282, + "step": 1658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 983.0, + "completions/max_terminated_length": 983.0, + "completions/mean_length": 499.482177734375, + "completions/mean_terminated_length": 499.482177734375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 1.711890637090534, + "grad_norm": 0.6195462942123413, + "kl": 0.1011962890625, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 194391410.0, + "reward": 1.2531250715255737, + "reward_std": 0.1709987223148346, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25312501192092896, + "rewards/curriculum_aware_reward_fn/std": 0.35088568925857544, + "step": 1659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 447.419677734375, + "completions/mean_terminated_length": 447.419677734375, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 1.7129223626515349, + "grad_norm": 0.8020883202552795, + "kl": 0.115234375, + "learning_rate": 1e-06, + "loss": 0.0236, + "num_tokens": 194509547.0, + "reward": 1.5098215341567993, + "reward_std": 0.1900186985731125, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5098214149475098, + "rewards/curriculum_aware_reward_fn/std": 0.36778125166893005, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1608.0, + "completions/max_terminated_length": 1608.0, + "completions/mean_length": 494.919677734375, + "completions/mean_terminated_length": 494.919677734375, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 1.7139540882125355, + "grad_norm": 0.7013669013977051, + "kl": 0.1051025390625, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 194628699.0, + "reward": 1.3687500953674316, + "reward_std": 0.1604437232017517, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36875003576278687, + "rewards/curriculum_aware_reward_fn/std": 0.37326928973197937, + "step": 1661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 433.6339416503906, + "completions/mean_terminated_length": 433.6339416503906, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 1.714985813773536, + "grad_norm": 0.7422285079956055, + "kl": 0.1162109375, + "learning_rate": 1e-06, + "loss": 0.0122, + "num_tokens": 194741211.0, + "reward": 1.463392972946167, + "reward_std": 0.17094425857067108, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46339288353919983, + "rewards/curriculum_aware_reward_fn/std": 0.3927830159664154, + "step": 1662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1277.0, + "completions/max_terminated_length": 1277.0, + "completions/mean_length": 504.6964416503906, + "completions/mean_terminated_length": 504.6964416503906, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 1.716017539334537, + "grad_norm": 0.6004372835159302, + "kl": 0.104736328125, + "learning_rate": 1e-06, + "loss": 0.0127, + "num_tokens": 194861010.0, + "reward": 1.4325894117355347, + "reward_std": 0.11247504502534866, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4325892925262451, + "rewards/curriculum_aware_reward_fn/std": 0.48331207036972046, + "step": 1663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 949.0, + "completions/max_terminated_length": 949.0, + "completions/mean_length": 462.1339416503906, + "completions/mean_terminated_length": 462.1339416503906, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 1.7170492648955378, + "grad_norm": 0.8087993264198303, + "kl": 0.0948486328125, + "learning_rate": 1e-06, + "loss": 0.0189, + "num_tokens": 194970512.0, + "reward": 1.4084821939468384, + "reward_std": 0.19191642105579376, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4084821343421936, + "rewards/curriculum_aware_reward_fn/std": 0.3480245769023895, + "step": 1664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1514.0, + "completions/max_terminated_length": 1514.0, + "completions/mean_length": 529.294677734375, + "completions/mean_terminated_length": 529.294677734375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 1.7180809904565386, + "grad_norm": 0.6965283751487732, + "kl": 0.09814453125, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 195099016.0, + "reward": 1.3593751192092896, + "reward_std": 0.1859753429889679, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3683035671710968, + "rewards/curriculum_aware_reward_fn/std": 0.3423551023006439, + "step": 1665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 420.46429443359375, + "completions/mean_terminated_length": 420.46429443359375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 1.7191127160175395, + "grad_norm": 0.681612491607666, + "kl": 0.1048583984375, + "learning_rate": 1e-06, + "loss": 0.0202, + "num_tokens": 195206938.0, + "reward": 1.4933037757873535, + "reward_std": 0.11284186691045761, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4933035671710968, + "rewards/curriculum_aware_reward_fn/std": 0.397089421749115, + "step": 1666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 815.0, + "completions/max_terminated_length": 815.0, + "completions/mean_length": 480.1607360839844, + "completions/mean_terminated_length": 480.1607360839844, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 1.72014444157854, + "grad_norm": 0.8302071690559387, + "kl": 0.1201171875, + "learning_rate": 1e-06, + "loss": -0.0097, + "num_tokens": 195328749.0, + "reward": 1.387946605682373, + "reward_std": 0.1679256558418274, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3879464268684387, + "rewards/curriculum_aware_reward_fn/std": 0.37273311614990234, + "step": 1667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1076.0, + "completions/max_terminated_length": 1076.0, + "completions/mean_length": 509.7232360839844, + "completions/mean_terminated_length": 509.7232360839844, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 1.721176167139541, + "grad_norm": 0.7681064009666443, + "kl": 0.0999755859375, + "learning_rate": 1e-06, + "loss": 0.013, + "num_tokens": 195457798.0, + "reward": 1.282142996788025, + "reward_std": 0.16930948197841644, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28214287757873535, + "rewards/curriculum_aware_reward_fn/std": 0.3344254791736603, + "step": 1668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 848.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 456.1160888671875, + "completions/mean_terminated_length": 456.1160888671875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 1.7222078927005415, + "grad_norm": 0.8119245767593384, + "kl": 0.0977783203125, + "learning_rate": 1e-06, + "loss": -0.0156, + "num_tokens": 195581025.0, + "reward": 1.4183037281036377, + "reward_std": 0.21370477974414825, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41830357909202576, + "rewards/curriculum_aware_reward_fn/std": 0.37156811356544495, + "step": 1669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 986.0, + "completions/max_terminated_length": 986.0, + "completions/mean_length": 474.7589416503906, + "completions/mean_terminated_length": 474.7589416503906, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 1.7232396182615424, + "grad_norm": 0.7375963926315308, + "kl": 0.1015625, + "learning_rate": 1e-06, + "loss": 0.0114, + "num_tokens": 195695749.0, + "reward": 1.3129465579986572, + "reward_std": 0.15211425721645355, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3129464089870453, + "rewards/curriculum_aware_reward_fn/std": 0.35947567224502563, + "step": 1670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1140.0, + "completions/max_terminated_length": 1140.0, + "completions/mean_length": 494.15179443359375, + "completions/mean_terminated_length": 494.15179443359375, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 1.7242713438225432, + "grad_norm": 0.7717375159263611, + "kl": 0.105224609375, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 195822375.0, + "reward": 1.2986608743667603, + "reward_std": 0.146214559674263, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2986607551574707, + "rewards/curriculum_aware_reward_fn/std": 0.3745543956756592, + "step": 1671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 986.0, + "completions/max_terminated_length": 986.0, + "completions/mean_length": 509.8839416503906, + "completions/mean_terminated_length": 509.8839416503906, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 1.725303069383544, + "grad_norm": 0.7136673331260681, + "kl": 0.108154296875, + "learning_rate": 1e-06, + "loss": -0.0078, + "num_tokens": 195945226.0, + "reward": 1.2910715341567993, + "reward_std": 0.14577467739582062, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29107144474983215, + "rewards/curriculum_aware_reward_fn/std": 0.35937973856925964, + "step": 1672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1102.0, + "completions/max_terminated_length": 1102.0, + "completions/mean_length": 481.5714416503906, + "completions/mean_terminated_length": 481.5714416503906, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 1.726334794944545, + "grad_norm": 0.7931310534477234, + "kl": 0.1044921875, + "learning_rate": 1e-06, + "loss": 0.0145, + "num_tokens": 196066859.0, + "reward": 1.401785969734192, + "reward_std": 0.2273138165473938, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4107142984867096, + "rewards/curriculum_aware_reward_fn/std": 0.3675110340118408, + "step": 1673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 450.6160888671875, + "completions/mean_terminated_length": 450.6160888671875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 1.7273665205055455, + "grad_norm": 0.7282100319862366, + "kl": 0.10791015625, + "learning_rate": 1e-06, + "loss": 0.0172, + "num_tokens": 196189491.0, + "reward": 1.302232265472412, + "reward_std": 0.1299796998500824, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30223211646080017, + "rewards/curriculum_aware_reward_fn/std": 0.36769235134124756, + "step": 1674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1093.0, + "completions/max_terminated_length": 1093.0, + "completions/mean_length": 511.732177734375, + "completions/mean_terminated_length": 511.732177734375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 1.7283982460665464, + "grad_norm": 0.7027313113212585, + "kl": 0.098876953125, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 196314775.0, + "reward": 1.3165180683135986, + "reward_std": 0.11481791734695435, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3165178596973419, + "rewards/curriculum_aware_reward_fn/std": 0.38264068961143494, + "step": 1675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 433.8214416503906, + "completions/mean_terminated_length": 433.8214416503906, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 1.729429971627547, + "grad_norm": 0.7571617364883423, + "kl": 0.105712890625, + "learning_rate": 1e-06, + "loss": 0.0046, + "num_tokens": 196425103.0, + "reward": 1.4602679014205933, + "reward_std": 0.2094980925321579, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4602678716182709, + "rewards/curriculum_aware_reward_fn/std": 0.37248486280441284, + "step": 1676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1158.0, + "completions/max_terminated_length": 1158.0, + "completions/mean_length": 525.3392944335938, + "completions/mean_terminated_length": 525.3392944335938, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 1.7304616971885478, + "grad_norm": 0.6972119808197021, + "kl": 0.093505859375, + "learning_rate": 1e-06, + "loss": 0.0071, + "num_tokens": 196566989.0, + "reward": 1.224107265472412, + "reward_std": 0.17237694561481476, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.22410713136196136, + "rewards/curriculum_aware_reward_fn/std": 0.33402958512306213, + "step": 1677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 448.7589416503906, + "completions/mean_terminated_length": 448.7589416503906, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 1.7314934227495486, + "grad_norm": 0.7930586338043213, + "kl": 0.1019287109375, + "learning_rate": 1e-06, + "loss": 0.0157, + "num_tokens": 196675343.0, + "reward": 1.4683037996292114, + "reward_std": 0.18954959511756897, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4683036208152771, + "rewards/curriculum_aware_reward_fn/std": 0.3429466784000397, + "step": 1678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 936.0, + "completions/max_terminated_length": 936.0, + "completions/mean_length": 475.7589416503906, + "completions/mean_terminated_length": 475.7589416503906, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 1.7325251483105495, + "grad_norm": 0.736160159111023, + "kl": 0.111083984375, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 196793168.0, + "reward": 1.3767858743667603, + "reward_std": 0.22271791100502014, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3767857253551483, + "rewards/curriculum_aware_reward_fn/std": 0.4599710702896118, + "step": 1679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 444.544677734375, + "completions/mean_terminated_length": 444.544677734375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 1.7335568738715503, + "grad_norm": 0.7676131129264832, + "kl": 0.0968017578125, + "learning_rate": 1e-06, + "loss": -0.0091, + "num_tokens": 196910710.0, + "reward": 1.5258928537368774, + "reward_std": 0.13285356760025024, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5258928537368774, + "rewards/curriculum_aware_reward_fn/std": 0.35131555795669556, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 983.0, + "completions/max_terminated_length": 983.0, + "completions/mean_length": 484.9732360839844, + "completions/mean_terminated_length": 484.9732360839844, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 1.734588599432551, + "grad_norm": 0.729081392288208, + "kl": 0.115478515625, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 197032192.0, + "reward": 1.235267996788025, + "reward_std": 0.1558685004711151, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.23526786267757416, + "rewards/curriculum_aware_reward_fn/std": 0.31755247712135315, + "step": 1681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 467.9732360839844, + "completions/mean_terminated_length": 467.9732360839844, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 1.7356203249935516, + "grad_norm": 0.7132700085639954, + "kl": 0.1109619140625, + "learning_rate": 1e-06, + "loss": 0.026, + "num_tokens": 197146324.0, + "reward": 1.4147323369979858, + "reward_std": 0.15210282802581787, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4147321283817291, + "rewards/curriculum_aware_reward_fn/std": 0.4016653597354889, + "step": 1682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 953.0, + "completions/max_terminated_length": 953.0, + "completions/mean_length": 467.4464416503906, + "completions/mean_terminated_length": 467.4464416503906, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 1.7366520505545524, + "grad_norm": 0.8161846995353699, + "kl": 0.0970458984375, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 197265668.0, + "reward": 1.3549107313156128, + "reward_std": 0.20558597147464752, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3638392984867096, + "rewards/curriculum_aware_reward_fn/std": 0.35515573620796204, + "step": 1683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1031.0, + "completions/max_terminated_length": 1031.0, + "completions/mean_length": 518.3839721679688, + "completions/mean_terminated_length": 518.3839721679688, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 1.7376837761155532, + "grad_norm": 0.7936047911643982, + "kl": 0.102294921875, + "learning_rate": 1e-06, + "loss": -0.0053, + "num_tokens": 197393664.0, + "reward": 1.4267858266830444, + "reward_std": 0.20954875648021698, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42678573727607727, + "rewards/curriculum_aware_reward_fn/std": 0.3882235586643219, + "step": 1684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 979.0, + "completions/max_terminated_length": 979.0, + "completions/mean_length": 530.5625, + "completions/mean_terminated_length": 530.5625, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 1.738715501676554, + "grad_norm": 0.6724871397018433, + "kl": 0.0965576171875, + "learning_rate": 1e-06, + "loss": 0.0118, + "num_tokens": 197532005.0, + "reward": 1.2790179252624512, + "reward_std": 0.17441387474536896, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2790178656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3681361675262451, + "step": 1685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 447.9107360839844, + "completions/mean_terminated_length": 447.9107360839844, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 1.739747227237555, + "grad_norm": 0.6834338903427124, + "kl": 0.1043701171875, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 197650050.0, + "reward": 1.4276787042617798, + "reward_std": 0.15436527132987976, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42767858505249023, + "rewards/curriculum_aware_reward_fn/std": 0.3626006841659546, + "step": 1686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1346.0, + "completions/max_terminated_length": 1346.0, + "completions/mean_length": 449.4732360839844, + "completions/mean_terminated_length": 449.4732360839844, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 1.7407789527985555, + "grad_norm": 0.8147615194320679, + "kl": 0.0997314453125, + "learning_rate": 1e-06, + "loss": -0.027, + "num_tokens": 197770629.0, + "reward": 1.454017996788025, + "reward_std": 0.1792232096195221, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.46294644474983215, + "rewards/curriculum_aware_reward_fn/std": 0.3991965651512146, + "step": 1687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 930.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 527.3482666015625, + "completions/mean_terminated_length": 527.3482666015625, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 1.7418106783595564, + "grad_norm": 1.57326078414917, + "kl": 0.101806640625, + "learning_rate": 1e-06, + "loss": 0.0062, + "num_tokens": 197906272.0, + "reward": 1.1660715341567993, + "reward_std": 0.18589244782924652, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.17499999701976776, + "rewards/curriculum_aware_reward_fn/std": 0.27963942289352417, + "step": 1688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 441.33929443359375, + "completions/mean_terminated_length": 441.33929443359375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 1.742842403920557, + "grad_norm": 0.6428574323654175, + "kl": 0.1011962890625, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 198026703.0, + "reward": 1.3812501430511475, + "reward_std": 0.13791553676128387, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3812500536441803, + "rewards/curriculum_aware_reward_fn/std": 0.3720000386238098, + "step": 1689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 483.6785888671875, + "completions/mean_terminated_length": 483.6785888671875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 1.7438741294815578, + "grad_norm": 0.6593337655067444, + "kl": 0.0970458984375, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 198157714.0, + "reward": 1.2169643640518188, + "reward_std": 0.09230455011129379, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.21696427464485168, + "rewards/curriculum_aware_reward_fn/std": 0.33054375648498535, + "step": 1690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 902.0, + "completions/max_terminated_length": 902.0, + "completions/mean_length": 459.4285888671875, + "completions/mean_terminated_length": 459.4285888671875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 1.7449058550425587, + "grad_norm": 0.5632081627845764, + "kl": 0.0977783203125, + "learning_rate": 1e-06, + "loss": 0.0048, + "num_tokens": 198276083.0, + "reward": 1.3674108982086182, + "reward_std": 0.1574571579694748, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36741071939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3781004250049591, + "step": 1691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 975.0, + "completions/max_terminated_length": 975.0, + "completions/mean_length": 487.5000305175781, + "completions/mean_terminated_length": 487.5000305175781, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 1.7459375806035595, + "grad_norm": 0.7337881326675415, + "kl": 0.09326171875, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 198399990.0, + "reward": 1.2334822416305542, + "reward_std": 0.1932225376367569, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.23348215222358704, + "rewards/curriculum_aware_reward_fn/std": 0.3446102738380432, + "step": 1692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 471.669677734375, + "completions/mean_terminated_length": 471.669677734375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 1.7469693061645604, + "grad_norm": 0.7575318217277527, + "kl": 0.1068115234375, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 198517336.0, + "reward": 1.3642858266830444, + "reward_std": 0.20217333734035492, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36428573727607727, + "rewards/curriculum_aware_reward_fn/std": 0.33887922763824463, + "step": 1693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1079.0, + "completions/max_terminated_length": 1079.0, + "completions/mean_length": 470.982177734375, + "completions/mean_terminated_length": 470.982177734375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 1.748001031725561, + "grad_norm": 0.7248097658157349, + "kl": 0.10400390625, + "learning_rate": 1e-06, + "loss": -0.0024, + "num_tokens": 198636180.0, + "reward": 1.497321605682373, + "reward_std": 0.10733328759670258, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4973214566707611, + "rewards/curriculum_aware_reward_fn/std": 0.35570254921913147, + "step": 1694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 454.6875305175781, + "completions/mean_terminated_length": 454.6875305175781, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 1.7490327572865618, + "grad_norm": 0.7512885332107544, + "kl": 0.0963134765625, + "learning_rate": 1e-06, + "loss": 0.0195, + "num_tokens": 198759184.0, + "reward": 1.4040179252624512, + "reward_std": 0.15522757172584534, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4040178954601288, + "rewards/curriculum_aware_reward_fn/std": 0.3663269579410553, + "step": 1695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1483.0, + "completions/max_terminated_length": 1483.0, + "completions/mean_length": 470.76788330078125, + "completions/mean_terminated_length": 470.76788330078125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 1.7500644828475624, + "grad_norm": 0.7817060947418213, + "kl": 0.095458984375, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 198878935.0, + "reward": 1.3401787281036377, + "reward_std": 0.16438253223896027, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34017854928970337, + "rewards/curriculum_aware_reward_fn/std": 0.37161925435066223, + "step": 1696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 797.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 430.8660888671875, + "completions/mean_terminated_length": 430.8660888671875, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 1.7510962084085633, + "grad_norm": 0.8149338960647583, + "kl": 0.1177978515625, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 198989807.0, + "reward": 1.4232144355773926, + "reward_std": 0.23195189237594604, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42321428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.39346742630004883, + "step": 1697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1002.0, + "completions/max_terminated_length": 1002.0, + "completions/mean_length": 464.3214416503906, + "completions/mean_terminated_length": 464.3214416503906, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 1.7521279339695641, + "grad_norm": 0.6136691570281982, + "kl": 0.0950927734375, + "learning_rate": 1e-06, + "loss": 0.0178, + "num_tokens": 199112049.0, + "reward": 1.35535728931427, + "reward_std": 0.14693090319633484, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3642857074737549, + "rewards/curriculum_aware_reward_fn/std": 0.4077199101448059, + "step": 1698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 825.0, + "completions/max_terminated_length": 825.0, + "completions/mean_length": 461.1250305175781, + "completions/mean_terminated_length": 461.1250305175781, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 1.753159659530565, + "grad_norm": 0.8194558620452881, + "kl": 0.1011962890625, + "learning_rate": 1e-06, + "loss": 0.0162, + "num_tokens": 199228557.0, + "reward": 1.4160715341567993, + "reward_std": 0.21462929248809814, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.42500004172325134, + "rewards/curriculum_aware_reward_fn/std": 0.3724765181541443, + "step": 1699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1068.0, + "completions/max_terminated_length": 1068.0, + "completions/mean_length": 444.8035888671875, + "completions/mean_terminated_length": 444.8035888671875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 1.7541913850915658, + "grad_norm": 0.6593208312988281, + "kl": 0.09814453125, + "learning_rate": 1e-06, + "loss": 0.0086, + "num_tokens": 199351280.0, + "reward": 1.3517858982086182, + "reward_std": 0.1568783074617386, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35178571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.4667275846004486, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.0, + "completions/max_terminated_length": 792.0, + "completions/mean_length": 466.8660888671875, + "completions/mean_terminated_length": 466.8660888671875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 1.7552231106525664, + "grad_norm": 0.4401684105396271, + "kl": 0.103759765625, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 199466744.0, + "reward": 1.2950893640518188, + "reward_std": 0.07254636287689209, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2950893044471741, + "rewards/curriculum_aware_reward_fn/std": 0.3630427122116089, + "step": 1701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1024.0, + "completions/mean_length": 459.14288330078125, + "completions/mean_terminated_length": 459.14288330078125, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 1.756254836213567, + "grad_norm": 0.6597514748573303, + "kl": 0.1014404296875, + "learning_rate": 1e-06, + "loss": -0.0039, + "num_tokens": 199580958.0, + "reward": 1.3616071939468384, + "reward_std": 0.12213817983865738, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3616071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.35163596272468567, + "step": 1702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 775.0, + "completions/max_terminated_length": 775.0, + "completions/mean_length": 442.02679443359375, + "completions/mean_terminated_length": 442.02679443359375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 1.7572865617745679, + "grad_norm": 0.6738682985305786, + "kl": 0.0911865234375, + "learning_rate": 1e-06, + "loss": 0.01, + "num_tokens": 199695341.0, + "reward": 1.3625000715255737, + "reward_std": 0.11475715041160583, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3777303099632263, + "step": 1703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 880.0, + "completions/max_terminated_length": 880.0, + "completions/mean_length": 443.419677734375, + "completions/mean_terminated_length": 443.419677734375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 1.7583182873355687, + "grad_norm": 0.6813015341758728, + "kl": 0.100830078125, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 199806457.0, + "reward": 1.3517858982086182, + "reward_std": 0.1438947170972824, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35178571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3782920837402344, + "step": 1704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 927.0, + "completions/mean_length": 547.0892944335938, + "completions/mean_terminated_length": 515.1171264648438, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 1.7593500128965696, + "grad_norm": 0.7241870164871216, + "kl": 0.0975341796875, + "learning_rate": 1e-06, + "loss": 0.0523, + "num_tokens": 199945123.0, + "reward": 1.2933037281036377, + "reward_std": 0.21056966483592987, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.30223217606544495, + "rewards/curriculum_aware_reward_fn/std": 0.3545958995819092, + "step": 1705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1118.0, + "completions/max_terminated_length": 1118.0, + "completions/mean_length": 471.0000305175781, + "completions/mean_terminated_length": 471.0000305175781, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 1.7603817384575704, + "grad_norm": 0.7351222038269043, + "kl": 0.1163330078125, + "learning_rate": 1e-06, + "loss": 0.0074, + "num_tokens": 200069513.0, + "reward": 1.4196429252624512, + "reward_std": 0.13800209760665894, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4196428954601288, + "rewards/curriculum_aware_reward_fn/std": 0.352874755859375, + "step": 1706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 866.0, + "completions/max_terminated_length": 866.0, + "completions/mean_length": 425.26788330078125, + "completions/mean_terminated_length": 425.26788330078125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 1.761413464018571, + "grad_norm": 0.7676159739494324, + "kl": 0.100830078125, + "learning_rate": 1e-06, + "loss": -0.003, + "num_tokens": 200193380.0, + "reward": 1.4200894832611084, + "reward_std": 0.16104407608509064, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4200893044471741, + "rewards/curriculum_aware_reward_fn/std": 0.36560821533203125, + "step": 1707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1157.0, + "completions/max_terminated_length": 1157.0, + "completions/mean_length": 460.2589416503906, + "completions/mean_terminated_length": 460.2589416503906, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 1.7624451895795719, + "grad_norm": 0.7323415279388428, + "kl": 0.1181640625, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 200316273.0, + "reward": 1.2410714626312256, + "reward_std": 0.17039266228675842, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.25, + "rewards/curriculum_aware_reward_fn/std": 0.3420262932777405, + "step": 1708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1358.0, + "completions/max_terminated_length": 1358.0, + "completions/mean_length": 471.8125305175781, + "completions/mean_terminated_length": 471.8125305175781, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 1.7634769151405725, + "grad_norm": 0.5964041352272034, + "kl": 0.1002197265625, + "learning_rate": 1e-06, + "loss": 0.0084, + "num_tokens": 200441720.0, + "reward": 1.3169643878936768, + "reward_std": 0.09356345236301422, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3169642984867096, + "rewards/curriculum_aware_reward_fn/std": 0.34416329860687256, + "step": 1709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1023.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 411.6339416503906, + "completions/mean_terminated_length": 411.6339416503906, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 1.7645086407015733, + "grad_norm": 0.757871150970459, + "kl": 0.0986328125, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 200553826.0, + "reward": 1.4549108743667603, + "reward_std": 0.22730287909507751, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4549107253551483, + "rewards/curriculum_aware_reward_fn/std": 0.40263545513153076, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 836.0, + "completions/max_terminated_length": 836.0, + "completions/mean_length": 438.45538330078125, + "completions/mean_terminated_length": 438.45538330078125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 1.7655403662625742, + "grad_norm": 0.7597662210464478, + "kl": 0.0989990234375, + "learning_rate": 1e-06, + "loss": 0.0229, + "num_tokens": 200674648.0, + "reward": 1.439732313156128, + "reward_std": 0.16961470246315002, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4397321343421936, + "rewards/curriculum_aware_reward_fn/std": 0.3784831464290619, + "step": 1711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 886.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 470.4375305175781, + "completions/mean_terminated_length": 470.4375305175781, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 1.766572091823575, + "grad_norm": 0.6943768858909607, + "kl": 0.098388671875, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 200801663.0, + "reward": 1.3165180683135986, + "reward_std": 0.18066294491291046, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3165178894996643, + "rewards/curriculum_aware_reward_fn/std": 0.34493690729141235, + "step": 1712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 915.0, + "completions/max_terminated_length": 915.0, + "completions/mean_length": 457.0089416503906, + "completions/mean_terminated_length": 457.0089416503906, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 1.7676038173845758, + "grad_norm": 0.6977770328521729, + "kl": 0.0997314453125, + "learning_rate": 1e-06, + "loss": 0.0093, + "num_tokens": 200923107.0, + "reward": 1.387946605682373, + "reward_std": 0.19864727556705475, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3879464566707611, + "rewards/curriculum_aware_reward_fn/std": 0.3827499747276306, + "step": 1713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 976.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 439.044677734375, + "completions/mean_terminated_length": 439.044677734375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 1.7686355429455765, + "grad_norm": 0.807112991809845, + "kl": 0.107177734375, + "learning_rate": 1e-06, + "loss": 0.029, + "num_tokens": 201039802.0, + "reward": 1.4120537042617798, + "reward_std": 0.1486327052116394, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41205358505249023, + "rewards/curriculum_aware_reward_fn/std": 0.37562233209609985, + "step": 1714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 377.1785888671875, + "completions/mean_terminated_length": 377.1785888671875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 1.7696672685065773, + "grad_norm": 0.7447263598442078, + "kl": 0.0958251953125, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 201139558.0, + "reward": 1.4366072416305542, + "reward_std": 0.16780443489551544, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43660715222358704, + "rewards/curriculum_aware_reward_fn/std": 0.37734147906303406, + "step": 1715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 897.0, + "completions/max_terminated_length": 897.0, + "completions/mean_length": 422.9732360839844, + "completions/mean_terminated_length": 422.9732360839844, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 1.770698994067578, + "grad_norm": 0.7423616051673889, + "kl": 0.0948486328125, + "learning_rate": 1e-06, + "loss": 0.0248, + "num_tokens": 201264379.0, + "reward": 1.2924107313156128, + "reward_std": 0.17788228392601013, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2924107015132904, + "rewards/curriculum_aware_reward_fn/std": 0.3585638105869293, + "step": 1716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 448.8839416503906, + "completions/mean_terminated_length": 448.8839416503906, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 1.7717307196285788, + "grad_norm": 0.655297577381134, + "kl": 0.0926513671875, + "learning_rate": 1e-06, + "loss": 0.0114, + "num_tokens": 201379523.0, + "reward": 1.3901787996292114, + "reward_std": 0.15744253993034363, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3901785910129547, + "rewards/curriculum_aware_reward_fn/std": 0.3982981741428375, + "step": 1717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 432.6875305175781, + "completions/mean_terminated_length": 432.6875305175781, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 1.7727624451895796, + "grad_norm": 0.8091326951980591, + "kl": 0.099609375, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 201497606.0, + "reward": 1.3633930683135986, + "reward_std": 0.1997928023338318, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3633928894996643, + "rewards/curriculum_aware_reward_fn/std": 0.3603665828704834, + "step": 1718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 984.0, + "completions/max_terminated_length": 984.0, + "completions/mean_length": 476.89288330078125, + "completions/mean_terminated_length": 476.89288330078125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 1.7737941707505804, + "grad_norm": 0.6853758692741394, + "kl": 0.0947265625, + "learning_rate": 1e-06, + "loss": 0.0096, + "num_tokens": 201617784.0, + "reward": 1.3906251192092896, + "reward_std": 0.13729803264141083, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.390625, + "rewards/curriculum_aware_reward_fn/std": 0.3858727812767029, + "step": 1719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 384.0446472167969, + "completions/mean_terminated_length": 384.0446472167969, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 1.7748258963115813, + "grad_norm": 0.6238521933555603, + "kl": 0.102294921875, + "learning_rate": 1e-06, + "loss": 0.0047, + "num_tokens": 201722124.0, + "reward": 1.4767858982086182, + "reward_std": 0.10937154293060303, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47678571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.4864344000816345, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 782.0, + "completions/max_terminated_length": 782.0, + "completions/mean_length": 413.3482360839844, + "completions/mean_terminated_length": 413.3482360839844, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 1.775857621872582, + "grad_norm": 0.8579419851303101, + "kl": 0.101806640625, + "learning_rate": 1e-06, + "loss": 0.0224, + "num_tokens": 201827779.0, + "reward": 1.2959821224212646, + "reward_std": 0.234655499458313, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29598215222358704, + "rewards/curriculum_aware_reward_fn/std": 0.36423057317733765, + "step": 1721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 872.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 438.26788330078125, + "completions/mean_terminated_length": 438.26788330078125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 1.7768893474335825, + "grad_norm": 0.8060879707336426, + "kl": 0.0941162109375, + "learning_rate": 1e-06, + "loss": 0.0327, + "num_tokens": 201941937.0, + "reward": 1.2674108743667603, + "reward_std": 0.17359891533851624, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2674107253551483, + "rewards/curriculum_aware_reward_fn/std": 0.32719871401786804, + "step": 1722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 398.64288330078125, + "completions/mean_terminated_length": 398.64288330078125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 1.7779210729945834, + "grad_norm": 0.6622222065925598, + "kl": 0.0894775390625, + "learning_rate": 1e-06, + "loss": -0.0084, + "num_tokens": 202047618.0, + "reward": 1.4589287042617798, + "reward_std": 0.12163443863391876, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45892858505249023, + "rewards/curriculum_aware_reward_fn/std": 0.39639222621917725, + "step": 1723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1050.0, + "completions/max_terminated_length": 1050.0, + "completions/mean_length": 455.7500305175781, + "completions/mean_terminated_length": 455.7500305175781, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 1.7789527985555842, + "grad_norm": 0.7916659116744995, + "kl": 0.101806640625, + "learning_rate": 1e-06, + "loss": 0.0454, + "num_tokens": 202170846.0, + "reward": 1.4361608028411865, + "reward_std": 0.18718430399894714, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43616071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.3482394516468048, + "step": 1724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 979.0, + "completions/max_terminated_length": 979.0, + "completions/mean_length": 426.9821472167969, + "completions/mean_terminated_length": 426.9821472167969, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 1.779984524116585, + "grad_norm": 0.719247043132782, + "kl": 0.0931396484375, + "learning_rate": 1e-06, + "loss": 0.0163, + "num_tokens": 202283768.0, + "reward": 1.3799108266830444, + "reward_std": 0.17570778727531433, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37991073727607727, + "rewards/curriculum_aware_reward_fn/std": 0.34271439909935, + "step": 1725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 402.0089416503906, + "completions/mean_terminated_length": 402.0089416503906, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 1.7810162496775859, + "grad_norm": 0.7211610078811646, + "kl": 0.0865478515625, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 202392192.0, + "reward": 1.2428573369979858, + "reward_std": 0.22202114760875702, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.24285714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.35105100274086, + "step": 1726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 748.0, + "completions/max_terminated_length": 748.0, + "completions/mean_length": 437.5357360839844, + "completions/mean_terminated_length": 437.5357360839844, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 1.7820479752385865, + "grad_norm": 0.7633671164512634, + "kl": 0.087646484375, + "learning_rate": 1e-06, + "loss": 0.014, + "num_tokens": 202501578.0, + "reward": 1.3321430683135986, + "reward_std": 0.16112089157104492, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3321428596973419, + "rewards/curriculum_aware_reward_fn/std": 0.3619179129600525, + "step": 1727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 413.2946472167969, + "completions/mean_terminated_length": 413.2946472167969, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 1.7830797007995873, + "grad_norm": 0.703177273273468, + "kl": 0.091796875, + "learning_rate": 1e-06, + "loss": -0.0085, + "num_tokens": 202614129.0, + "reward": 1.4330357313156128, + "reward_std": 0.15755793452262878, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4330357015132904, + "rewards/curriculum_aware_reward_fn/std": 0.368986576795578, + "step": 1728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 778.0, + "completions/max_terminated_length": 778.0, + "completions/mean_length": 341.8482360839844, + "completions/mean_terminated_length": 341.8482360839844, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 1.784111426360588, + "grad_norm": 0.7756032347679138, + "kl": 0.103515625, + "learning_rate": 1e-06, + "loss": -0.0035, + "num_tokens": 202709200.0, + "reward": 1.5513393878936768, + "reward_std": 0.1884058266878128, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.551339328289032, + "rewards/curriculum_aware_reward_fn/std": 0.44415420293807983, + "step": 1729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 433.52679443359375, + "completions/mean_terminated_length": 433.52679443359375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 1.7851431519215888, + "grad_norm": 0.7666996717453003, + "kl": 0.0926513671875, + "learning_rate": 1e-06, + "loss": -0.0093, + "num_tokens": 202822050.0, + "reward": 1.3267858028411865, + "reward_std": 0.18164461851119995, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32678571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.35829484462738037, + "step": 1730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 874.0, + "completions/max_terminated_length": 874.0, + "completions/mean_length": 385.1875305175781, + "completions/mean_terminated_length": 385.1875305175781, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 1.7861748774825896, + "grad_norm": 0.796438455581665, + "kl": 0.089111328125, + "learning_rate": 1e-06, + "loss": 0.0073, + "num_tokens": 202927190.0, + "reward": 1.4678572416305542, + "reward_std": 0.1948614865541458, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4678571820259094, + "rewards/curriculum_aware_reward_fn/std": 0.3767366409301758, + "step": 1731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 418.39288330078125, + "completions/mean_terminated_length": 418.39288330078125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 1.7872066030435905, + "grad_norm": 0.4892652630805969, + "kl": 0.0892333984375, + "learning_rate": 1e-06, + "loss": -0.0075, + "num_tokens": 203033690.0, + "reward": 1.2924108505249023, + "reward_std": 0.04891003295779228, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2924107015132904, + "rewards/curriculum_aware_reward_fn/std": 0.3325583040714264, + "step": 1732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1191.0, + "completions/max_terminated_length": 1191.0, + "completions/mean_length": 416.01788330078125, + "completions/mean_terminated_length": 416.01788330078125, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 1.7882383286045913, + "grad_norm": 0.7362436056137085, + "kl": 0.09326171875, + "learning_rate": 1e-06, + "loss": -0.0104, + "num_tokens": 203150830.0, + "reward": 1.4714287519454956, + "reward_std": 0.16051512956619263, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47142860293388367, + "rewards/curriculum_aware_reward_fn/std": 0.3712478578090668, + "step": 1733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 992.0, + "completions/max_terminated_length": 992.0, + "completions/mean_length": 411.89288330078125, + "completions/mean_terminated_length": 411.89288330078125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 1.789270054165592, + "grad_norm": 0.6482226252555847, + "kl": 0.094482421875, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 203261152.0, + "reward": 1.4950894117355347, + "reward_std": 0.12809297442436218, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4950892925262451, + "rewards/curriculum_aware_reward_fn/std": 0.3854890465736389, + "step": 1734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 430.3839416503906, + "completions/mean_terminated_length": 430.3839416503906, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 1.7903017797265928, + "grad_norm": 0.7845043540000916, + "kl": 0.108642578125, + "learning_rate": 1e-06, + "loss": -0.0281, + "num_tokens": 203385046.0, + "reward": 1.2857143878936768, + "reward_std": 0.20165906846523285, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2857142984867096, + "rewards/curriculum_aware_reward_fn/std": 0.34336888790130615, + "step": 1735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 782.0, + "completions/max_terminated_length": 782.0, + "completions/mean_length": 424.4375305175781, + "completions/mean_terminated_length": 424.4375305175781, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 1.7913335052875934, + "grad_norm": 0.7672021389007568, + "kl": 0.102294921875, + "learning_rate": 1e-06, + "loss": 0.0143, + "num_tokens": 203506601.0, + "reward": 1.3767858743667603, + "reward_std": 0.15465092658996582, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3767857253551483, + "rewards/curriculum_aware_reward_fn/std": 0.37059295177459717, + "step": 1736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 845.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 445.8125305175781, + "completions/mean_terminated_length": 445.8125305175781, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 1.7923652308485942, + "grad_norm": 0.7034952640533447, + "kl": 0.10009765625, + "learning_rate": 1e-06, + "loss": 0.0168, + "num_tokens": 203632139.0, + "reward": 1.307142972946167, + "reward_std": 0.19115585088729858, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30714288353919983, + "rewards/curriculum_aware_reward_fn/std": 0.3540535867214203, + "step": 1737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 829.0, + "completions/max_terminated_length": 829.0, + "completions/mean_length": 441.294677734375, + "completions/mean_terminated_length": 441.294677734375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 1.793396956409595, + "grad_norm": 0.8159340620040894, + "kl": 0.1007080078125, + "learning_rate": 1e-06, + "loss": 0.0095, + "num_tokens": 203760027.0, + "reward": 1.2946430444717407, + "reward_std": 0.17885927855968475, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2946428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.32238954305648804, + "step": 1738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 412.3035888671875, + "completions/mean_terminated_length": 412.3035888671875, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 1.794428681970596, + "grad_norm": 0.7434394955635071, + "kl": 0.10009765625, + "learning_rate": 1e-06, + "loss": 0.0273, + "num_tokens": 203862289.0, + "reward": 1.3888394832611084, + "reward_std": 0.17571541666984558, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3888392746448517, + "rewards/curriculum_aware_reward_fn/std": 0.35461628437042236, + "step": 1739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 883.0, + "completions/max_terminated_length": 883.0, + "completions/mean_length": 438.6696472167969, + "completions/mean_terminated_length": 438.6696472167969, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 1.7954604075315967, + "grad_norm": 0.7299800515174866, + "kl": 0.097412109375, + "learning_rate": 1e-06, + "loss": 0.0164, + "num_tokens": 203986124.0, + "reward": 1.2584823369979858, + "reward_std": 0.16149833798408508, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2584821581840515, + "rewards/curriculum_aware_reward_fn/std": 0.33657577633857727, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1080.0, + "completions/max_terminated_length": 1080.0, + "completions/mean_length": 421.58038330078125, + "completions/mean_terminated_length": 421.58038330078125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 1.7964921330925974, + "grad_norm": 0.6581730842590332, + "kl": 0.1060791015625, + "learning_rate": 1e-06, + "loss": 0.0217, + "num_tokens": 204098350.0, + "reward": 1.3035714626312256, + "reward_std": 0.13841554522514343, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3035714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.34833672642707825, + "step": 1741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 407.1607360839844, + "completions/mean_terminated_length": 407.1607360839844, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 1.797523858653598, + "grad_norm": 0.7710352540016174, + "kl": 0.09814453125, + "learning_rate": 1e-06, + "loss": 0.0134, + "num_tokens": 204209640.0, + "reward": 1.5558037757873535, + "reward_std": 0.17522591352462769, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5558035969734192, + "rewards/curriculum_aware_reward_fn/std": 0.35878583788871765, + "step": 1742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 424.6964416503906, + "completions/mean_terminated_length": 424.6964416503906, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 1.7985555842145988, + "grad_norm": 0.8840100765228271, + "kl": 0.1064453125, + "learning_rate": 1e-06, + "loss": -0.0236, + "num_tokens": 204322936.0, + "reward": 1.386160969734192, + "reward_std": 0.18574056029319763, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3861607015132904, + "rewards/curriculum_aware_reward_fn/std": 0.36442041397094727, + "step": 1743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 431.20538330078125, + "completions/mean_terminated_length": 431.20538330078125, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 1.7995873097755997, + "grad_norm": 0.801786482334137, + "kl": 0.099853515625, + "learning_rate": 1e-06, + "loss": 0.0202, + "num_tokens": 204449673.0, + "reward": 1.432142972946167, + "reward_std": 0.19646649062633514, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43214288353919983, + "rewards/curriculum_aware_reward_fn/std": 0.37283048033714294, + "step": 1744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 979.0, + "completions/max_terminated_length": 979.0, + "completions/mean_length": 379.9196472167969, + "completions/mean_terminated_length": 379.9196472167969, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 1.8006190353366005, + "grad_norm": 0.7736554741859436, + "kl": 0.12744140625, + "learning_rate": 1e-06, + "loss": 0.0149, + "num_tokens": 204562252.0, + "reward": 1.4883930683135986, + "reward_std": 0.12401814758777618, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4883928894996643, + "rewards/curriculum_aware_reward_fn/std": 0.3882846534252167, + "step": 1745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 445.1607360839844, + "completions/mean_terminated_length": 445.1607360839844, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 1.8016507608976013, + "grad_norm": 0.8602623343467712, + "kl": 0.105224609375, + "learning_rate": 1e-06, + "loss": 0.0268, + "num_tokens": 204678911.0, + "reward": 1.4656250476837158, + "reward_std": 0.24137361347675323, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46562501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.4218166768550873, + "step": 1746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1195.0, + "completions/max_terminated_length": 1195.0, + "completions/mean_length": 441.6785888671875, + "completions/mean_terminated_length": 441.6785888671875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 1.802682486458602, + "grad_norm": 0.659216046333313, + "kl": 0.0859375, + "learning_rate": 1e-06, + "loss": 0.0055, + "num_tokens": 204797028.0, + "reward": 1.4169642925262451, + "reward_std": 0.11663386970758438, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4169642925262451, + "rewards/curriculum_aware_reward_fn/std": 0.4146828353404999, + "step": 1747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 418.1964416503906, + "completions/mean_terminated_length": 418.1964416503906, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 1.8037142120196028, + "grad_norm": 0.7895244359970093, + "kl": 0.0970458984375, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 204921804.0, + "reward": 1.3651787042617798, + "reward_std": 0.1474582552909851, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36517858505249023, + "rewards/curriculum_aware_reward_fn/std": 0.3904743790626526, + "step": 1748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.0, + "completions/max_terminated_length": 742.0, + "completions/mean_length": 430.0357360839844, + "completions/mean_terminated_length": 430.0357360839844, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 1.8047459375806034, + "grad_norm": 0.8099697232246399, + "kl": 0.09326171875, + "learning_rate": 1e-06, + "loss": 0.006, + "num_tokens": 205040122.0, + "reward": 1.3459821939468384, + "reward_std": 0.2298395037651062, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3549107015132904, + "rewards/curriculum_aware_reward_fn/std": 0.4184354841709137, + "step": 1749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 878.0, + "completions/max_terminated_length": 878.0, + "completions/mean_length": 431.3571472167969, + "completions/mean_terminated_length": 431.3571472167969, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 1.8057776631416043, + "grad_norm": 0.600109338760376, + "kl": 0.09130859375, + "learning_rate": 1e-06, + "loss": 0.0148, + "num_tokens": 205154024.0, + "reward": 1.3263393640518188, + "reward_std": 0.09809020906686783, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3263392746448517, + "rewards/curriculum_aware_reward_fn/std": 0.4202616214752197, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 410.6785888671875, + "completions/mean_terminated_length": 410.6785888671875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 1.806809388702605, + "grad_norm": 0.8304281234741211, + "kl": 0.1104736328125, + "learning_rate": 1e-06, + "loss": -0.0238, + "num_tokens": 205262026.0, + "reward": 1.4589287042617798, + "reward_std": 0.2656938135623932, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.47678568959236145, + "rewards/curriculum_aware_reward_fn/std": 0.4030809700489044, + "step": 1751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1187.0, + "completions/max_terminated_length": 1187.0, + "completions/mean_length": 411.7946472167969, + "completions/mean_terminated_length": 411.7946472167969, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 1.807841114263606, + "grad_norm": 0.5996778011322021, + "kl": 0.0955810546875, + "learning_rate": 1e-06, + "loss": 0.0341, + "num_tokens": 205372232.0, + "reward": 1.3665179014205933, + "reward_std": 0.12944695353507996, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3665178418159485, + "rewards/curriculum_aware_reward_fn/std": 0.3909095525741577, + "step": 1752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1026.0, + "completions/max_terminated_length": 1026.0, + "completions/mean_length": 408.6071472167969, + "completions/mean_terminated_length": 408.6071472167969, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 1.8088728398246068, + "grad_norm": 0.7588355541229248, + "kl": 0.0958251953125, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 205492504.0, + "reward": 1.5517857074737549, + "reward_std": 0.18035660684108734, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5517856478691101, + "rewards/curriculum_aware_reward_fn/std": 0.44508975744247437, + "step": 1753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 431.0000305175781, + "completions/mean_terminated_length": 431.0000305175781, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 1.8099045653856074, + "grad_norm": 0.770481526851654, + "kl": 0.0980224609375, + "learning_rate": 1e-06, + "loss": -0.0309, + "num_tokens": 205607066.0, + "reward": 1.4549108743667603, + "reward_std": 0.14845071732997894, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4549106955528259, + "rewards/curriculum_aware_reward_fn/std": 0.41622281074523926, + "step": 1754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 393.0000305175781, + "completions/mean_terminated_length": 393.0000305175781, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 1.8109362909466082, + "grad_norm": 0.7532888054847717, + "kl": 0.0965576171875, + "learning_rate": 1e-06, + "loss": 0.0078, + "num_tokens": 205712038.0, + "reward": 1.3513394594192505, + "reward_std": 0.1509929597377777, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35133928060531616, + "rewards/curriculum_aware_reward_fn/std": 0.42418229579925537, + "step": 1755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 872.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 446.6785888671875, + "completions/mean_terminated_length": 446.6785888671875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 1.8119680165076089, + "grad_norm": 0.7780160903930664, + "kl": 0.0970458984375, + "learning_rate": 1e-06, + "loss": 0.0138, + "num_tokens": 205833949.0, + "reward": 1.2696428298950195, + "reward_std": 0.20583878457546234, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2696428596973419, + "rewards/curriculum_aware_reward_fn/std": 0.3794470727443695, + "step": 1756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 359.6071472167969, + "completions/mean_terminated_length": 359.6071472167969, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 1.8129997420686097, + "grad_norm": 0.631820023059845, + "kl": 0.1031494140625, + "learning_rate": 1e-06, + "loss": 0.0187, + "num_tokens": 205930848.0, + "reward": 1.600000023841858, + "reward_std": 0.08937109261751175, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5999999642372131, + "rewards/curriculum_aware_reward_fn/std": 0.42181503772735596, + "step": 1757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 979.0, + "completions/max_terminated_length": 979.0, + "completions/mean_length": 444.5089416503906, + "completions/mean_terminated_length": 444.5089416503906, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 1.8140314676296105, + "grad_norm": 0.8031406402587891, + "kl": 0.0958251953125, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 206046517.0, + "reward": 1.399553656578064, + "reward_std": 0.17127352952957153, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3995535671710968, + "rewards/curriculum_aware_reward_fn/std": 0.4392610490322113, + "step": 1758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 370.4910888671875, + "completions/mean_terminated_length": 370.4910888671875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 1.8150631931906114, + "grad_norm": 0.7603376507759094, + "kl": 0.1036376953125, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 206148289.0, + "reward": 1.7169642448425293, + "reward_std": 0.1733575463294983, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.7169643044471741, + "rewards/curriculum_aware_reward_fn/std": 0.3697182834148407, + "step": 1759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 425.9375305175781, + "completions/mean_terminated_length": 425.9375305175781, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 1.8160949187516122, + "grad_norm": 0.81327223777771, + "kl": 0.1011962890625, + "learning_rate": 1e-06, + "loss": 0.0403, + "num_tokens": 206261311.0, + "reward": 1.4276787042617798, + "reward_std": 0.20803497731685638, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42767858505249023, + "rewards/curriculum_aware_reward_fn/std": 0.4219666123390198, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 409.52679443359375, + "completions/mean_terminated_length": 409.52679443359375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 1.8171266443126128, + "grad_norm": 0.8098947405815125, + "kl": 0.10400390625, + "learning_rate": 1e-06, + "loss": 0.0092, + "num_tokens": 206377388.0, + "reward": 1.594642996788025, + "reward_std": 0.3190336227416992, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5946428179740906, + "rewards/curriculum_aware_reward_fn/std": 0.48543453216552734, + "step": 1761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1371.0, + "completions/max_terminated_length": 1371.0, + "completions/mean_length": 454.1607360839844, + "completions/mean_terminated_length": 454.1607360839844, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 1.8181583698736135, + "grad_norm": 0.7259615659713745, + "kl": 0.095458984375, + "learning_rate": 1e-06, + "loss": -0.0238, + "num_tokens": 206494936.0, + "reward": 1.5138393640518188, + "reward_std": 0.18669991195201874, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5138392448425293, + "rewards/curriculum_aware_reward_fn/std": 0.4641280770301819, + "step": 1762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 930.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 478.3214416503906, + "completions/mean_terminated_length": 478.3214416503906, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 1.8191900954346143, + "grad_norm": 0.7194581031799316, + "kl": 0.1046142578125, + "learning_rate": 1e-06, + "loss": -0.0265, + "num_tokens": 206617808.0, + "reward": 1.4325894117355347, + "reward_std": 0.19652755558490753, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43258926272392273, + "rewards/curriculum_aware_reward_fn/std": 0.39896881580352783, + "step": 1763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2077.0, + "completions/max_terminated_length": 2077.0, + "completions/mean_length": 488.3214416503906, + "completions/mean_terminated_length": 488.3214416503906, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 1.8202218209956151, + "grad_norm": 0.7811593413352966, + "kl": 0.0987548828125, + "learning_rate": 1e-06, + "loss": 0.0242, + "num_tokens": 206734870.0, + "reward": 1.3196429014205933, + "reward_std": 0.2123473733663559, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3196428418159485, + "rewards/curriculum_aware_reward_fn/std": 0.3914259672164917, + "step": 1764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 861.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 430.2857360839844, + "completions/mean_terminated_length": 430.2857360839844, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 1.821253546556616, + "grad_norm": 0.6582061648368835, + "kl": 0.1060791015625, + "learning_rate": 1e-06, + "loss": 0.0062, + "num_tokens": 206848246.0, + "reward": 1.4267858266830444, + "reward_std": 0.1399049013853073, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4267857074737549, + "rewards/curriculum_aware_reward_fn/std": 0.44957080483436584, + "step": 1765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 986.0, + "completions/max_terminated_length": 986.0, + "completions/mean_length": 409.4821472167969, + "completions/mean_terminated_length": 409.4821472167969, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 1.8222852721176168, + "grad_norm": 0.7228449583053589, + "kl": 0.1021728515625, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 206957479.0, + "reward": 1.4669643640518188, + "reward_std": 0.20963212847709656, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4669642746448517, + "rewards/curriculum_aware_reward_fn/std": 0.44268599152565, + "step": 1766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 716.0, + "completions/max_terminated_length": 716.0, + "completions/mean_length": 437.0982360839844, + "completions/mean_terminated_length": 437.0982360839844, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 1.8233169976786174, + "grad_norm": 0.8272615075111389, + "kl": 0.108642578125, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 207069854.0, + "reward": 1.3575893640518188, + "reward_std": 0.22051258385181427, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3575893044471741, + "rewards/curriculum_aware_reward_fn/std": 0.3869032859802246, + "step": 1767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 899.0, + "completions/max_terminated_length": 899.0, + "completions/mean_length": 451.5089416503906, + "completions/mean_terminated_length": 451.5089416503906, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 1.8243487232396183, + "grad_norm": 0.8047498464584351, + "kl": 0.10888671875, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 207184970.0, + "reward": 1.3651785850524902, + "reward_std": 0.19732783734798431, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36517858505249023, + "rewards/curriculum_aware_reward_fn/std": 0.40862545371055603, + "step": 1768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 461.15179443359375, + "completions/mean_terminated_length": 461.15179443359375, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 1.825380448800619, + "grad_norm": 0.7979851961135864, + "kl": 0.1019287109375, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 207306444.0, + "reward": 1.2544643878936768, + "reward_std": 0.1744467318058014, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2544642984867096, + "rewards/curriculum_aware_reward_fn/std": 0.30913665890693665, + "step": 1769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 843.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 467.232177734375, + "completions/mean_terminated_length": 467.232177734375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 1.8264121743616197, + "grad_norm": 0.7705732583999634, + "kl": 0.10009765625, + "learning_rate": 1e-06, + "loss": 0.0121, + "num_tokens": 207430769.0, + "reward": 1.4004465341567993, + "reward_std": 0.14401130378246307, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40044644474983215, + "rewards/curriculum_aware_reward_fn/std": 0.41549360752105713, + "step": 1770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 899.0, + "completions/max_terminated_length": 899.0, + "completions/mean_length": 471.01788330078125, + "completions/mean_terminated_length": 471.01788330078125, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 1.8274438999226206, + "grad_norm": 0.7397488355636597, + "kl": 0.091796875, + "learning_rate": 1e-06, + "loss": -0.007, + "num_tokens": 207554125.0, + "reward": 1.4316965341567993, + "reward_std": 0.2323623150587082, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43169641494750977, + "rewards/curriculum_aware_reward_fn/std": 0.4476664066314697, + "step": 1771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 455.2500305175781, + "completions/mean_terminated_length": 455.2500305175781, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 1.8284756254836214, + "grad_norm": 0.8286288380622864, + "kl": 0.1236572265625, + "learning_rate": 1e-06, + "loss": 0.0351, + "num_tokens": 207670976.0, + "reward": 1.411607265472412, + "reward_std": 0.2313155084848404, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41160711646080017, + "rewards/curriculum_aware_reward_fn/std": 0.38007715344429016, + "step": 1772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 898.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 467.8750305175781, + "completions/mean_terminated_length": 467.8750305175781, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 1.8295073510446223, + "grad_norm": 0.7537168860435486, + "kl": 0.1103515625, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 207787561.0, + "reward": 1.376339316368103, + "reward_std": 0.24346770346164703, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37633928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3885732591152191, + "step": 1773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1012.0, + "completions/max_terminated_length": 1012.0, + "completions/mean_length": 454.4107360839844, + "completions/mean_terminated_length": 454.4107360839844, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 1.8305390766056229, + "grad_norm": 0.6410990953445435, + "kl": 0.103759765625, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 207905732.0, + "reward": 1.3656251430511475, + "reward_std": 0.14624857902526855, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3656249940395355, + "rewards/curriculum_aware_reward_fn/std": 0.4483719766139984, + "step": 1774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1396.0, + "completions/max_terminated_length": 1396.0, + "completions/mean_length": 499.15179443359375, + "completions/mean_terminated_length": 499.15179443359375, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 1.8315708021666237, + "grad_norm": 0.7663962841033936, + "kl": 0.107177734375, + "learning_rate": 1e-06, + "loss": 0.0196, + "num_tokens": 208023608.0, + "reward": 1.322767972946167, + "reward_std": 0.17575961351394653, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.32276788353919983, + "rewards/curriculum_aware_reward_fn/std": 0.39013099670410156, + "step": 1775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1136.0, + "completions/max_terminated_length": 1136.0, + "completions/mean_length": 465.7589416503906, + "completions/mean_terminated_length": 465.7589416503906, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 1.8326025277276243, + "grad_norm": 0.6929665803909302, + "kl": 0.0963134765625, + "learning_rate": 1e-06, + "loss": 0.0146, + "num_tokens": 208142805.0, + "reward": 1.4830358028411865, + "reward_std": 0.1856861263513565, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48303574323654175, + "rewards/curriculum_aware_reward_fn/std": 0.4536415934562683, + "step": 1776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 457.0714416503906, + "completions/mean_terminated_length": 457.0714416503906, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 1.8336342532886252, + "grad_norm": 0.639695942401886, + "kl": 0.1048583984375, + "learning_rate": 1e-06, + "loss": 0.0176, + "num_tokens": 208263640.0, + "reward": 1.345089316368103, + "reward_std": 0.14022766053676605, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34508928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.4171416759490967, + "step": 1777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 476.6250305175781, + "completions/mean_terminated_length": 476.6250305175781, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 1.834665978849626, + "grad_norm": 0.6727584004402161, + "kl": 0.10595703125, + "learning_rate": 1e-06, + "loss": 0.0178, + "num_tokens": 208385531.0, + "reward": 1.3147321939468384, + "reward_std": 0.20331941545009613, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3147321343421936, + "rewards/curriculum_aware_reward_fn/std": 0.38913384079933167, + "step": 1778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 893.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 427.3214416503906, + "completions/mean_terminated_length": 427.3214416503906, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 1.8356977044106269, + "grad_norm": 0.7415899038314819, + "kl": 0.104736328125, + "learning_rate": 1e-06, + "loss": 0.0259, + "num_tokens": 208503153.0, + "reward": 1.553125023841858, + "reward_std": 0.24731476604938507, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5531249642372131, + "rewards/curriculum_aware_reward_fn/std": 0.43346917629241943, + "step": 1779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 956.0, + "completions/max_terminated_length": 956.0, + "completions/mean_length": 451.3839416503906, + "completions/mean_terminated_length": 451.3839416503906, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 1.8367294299716277, + "grad_norm": 0.7726976275444031, + "kl": 0.1026611328125, + "learning_rate": 1e-06, + "loss": 0.0245, + "num_tokens": 208618682.0, + "reward": 1.4790178537368774, + "reward_std": 0.2106853425502777, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47901788353919983, + "rewards/curriculum_aware_reward_fn/std": 0.4393196105957031, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 911.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 490.01788330078125, + "completions/mean_terminated_length": 490.01788330078125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 1.8377611555326283, + "grad_norm": 0.8864200115203857, + "kl": 0.1116943359375, + "learning_rate": 1e-06, + "loss": 0.0219, + "num_tokens": 208744215.0, + "reward": 1.4366071224212646, + "reward_std": 0.30744442343711853, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43660715222358704, + "rewards/curriculum_aware_reward_fn/std": 0.4401567280292511, + "step": 1781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 965.0, + "completions/max_terminated_length": 965.0, + "completions/mean_length": 426.9285888671875, + "completions/mean_terminated_length": 426.9285888671875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 1.8387928810936292, + "grad_norm": 0.6458997130393982, + "kl": 0.1063232421875, + "learning_rate": 1e-06, + "loss": 0.027, + "num_tokens": 208861933.0, + "reward": 1.6839287281036377, + "reward_std": 0.16594833135604858, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6839285492897034, + "rewards/curriculum_aware_reward_fn/std": 0.39713016152381897, + "step": 1782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 777.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 453.6785888671875, + "completions/mean_terminated_length": 453.6785888671875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 1.8398246066546298, + "grad_norm": 0.821609377861023, + "kl": 0.1063232421875, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 208980379.0, + "reward": 1.4950894117355347, + "reward_std": 0.28672200441360474, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.49508926272392273, + "rewards/curriculum_aware_reward_fn/std": 0.44076913595199585, + "step": 1783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 971.0, + "completions/max_terminated_length": 971.0, + "completions/mean_length": 454.5982360839844, + "completions/mean_terminated_length": 454.5982360839844, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 1.8408563322156306, + "grad_norm": 0.7601539492607117, + "kl": 0.100341796875, + "learning_rate": 1e-06, + "loss": 0.0128, + "num_tokens": 209104171.0, + "reward": 1.4446429014205933, + "reward_std": 0.21672558784484863, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4446428418159485, + "rewards/curriculum_aware_reward_fn/std": 0.4232732355594635, + "step": 1784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1097.0, + "completions/max_terminated_length": 1097.0, + "completions/mean_length": 512.4375, + "completions/mean_terminated_length": 512.4375, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 1.8418880577766314, + "grad_norm": 0.7791252732276917, + "kl": 0.1077880859375, + "learning_rate": 1e-06, + "loss": 0.0133, + "num_tokens": 209236793.0, + "reward": 1.2834821939468384, + "reward_std": 0.20960944890975952, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2834821343421936, + "rewards/curriculum_aware_reward_fn/std": 0.3648638129234314, + "step": 1785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 451.607177734375, + "completions/mean_terminated_length": 451.607177734375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 1.8429197833376323, + "grad_norm": 0.7502228617668152, + "kl": 0.0966796875, + "learning_rate": 1e-06, + "loss": 0.0048, + "num_tokens": 209355903.0, + "reward": 1.3714288473129272, + "reward_std": 0.20309315621852875, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37142854928970337, + "rewards/curriculum_aware_reward_fn/std": 0.3378998637199402, + "step": 1786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 416.0089416503906, + "completions/mean_terminated_length": 416.0089416503906, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 1.8439515088986331, + "grad_norm": 0.8292548656463623, + "kl": 0.101318359375, + "learning_rate": 1e-06, + "loss": 0.0298, + "num_tokens": 209468996.0, + "reward": 1.6040180921554565, + "reward_std": 0.2973346710205078, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6040178537368774, + "rewards/curriculum_aware_reward_fn/std": 0.43434420228004456, + "step": 1787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1027.0, + "completions/max_terminated_length": 1027.0, + "completions/mean_length": 495.5000305175781, + "completions/mean_terminated_length": 495.5000305175781, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 1.8449832344596337, + "grad_norm": 0.7564486861228943, + "kl": 0.098876953125, + "learning_rate": 1e-06, + "loss": 0.0146, + "num_tokens": 209600330.0, + "reward": 1.4656251668930054, + "reward_std": 0.20407995581626892, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46562501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.37907135486602783, + "step": 1788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 426.65179443359375, + "completions/mean_terminated_length": 426.65179443359375, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 1.8460149600206344, + "grad_norm": 0.7819154262542725, + "kl": 0.1094970703125, + "learning_rate": 1e-06, + "loss": 0.0086, + "num_tokens": 209712473.0, + "reward": 1.4290179014205933, + "reward_std": 0.19162015616893768, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4290178418159485, + "rewards/curriculum_aware_reward_fn/std": 0.41669216752052307, + "step": 1789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 908.0, + "completions/max_terminated_length": 908.0, + "completions/mean_length": 452.5982360839844, + "completions/mean_terminated_length": 452.5982360839844, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 1.8470466855816352, + "grad_norm": 0.6342150568962097, + "kl": 0.0968017578125, + "learning_rate": 1e-06, + "loss": 0.0137, + "num_tokens": 209828243.0, + "reward": 1.5089287757873535, + "reward_std": 0.1409773826599121, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5089285969734192, + "rewards/curriculum_aware_reward_fn/std": 0.4546166658401489, + "step": 1790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 977.0, + "completions/max_terminated_length": 977.0, + "completions/mean_length": 456.794677734375, + "completions/mean_terminated_length": 456.794677734375, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 1.848078411142636, + "grad_norm": 0.7057480812072754, + "kl": 0.10302734375, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 209951112.0, + "reward": 1.4026787281036377, + "reward_std": 0.21222275495529175, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40267854928970337, + "rewards/curriculum_aware_reward_fn/std": 0.411536306142807, + "step": 1791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 445.669677734375, + "completions/mean_terminated_length": 445.669677734375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 1.8491101367036369, + "grad_norm": 0.7759240865707397, + "kl": 0.1025390625, + "learning_rate": 1e-06, + "loss": 0.0092, + "num_tokens": 210064846.0, + "reward": 1.3375000953674316, + "reward_std": 0.18568792939186096, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3375000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.3734125792980194, + "step": 1792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.0, + "completions/max_terminated_length": 667.0, + "completions/mean_length": 353.2321472167969, + "completions/mean_terminated_length": 353.2321472167969, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 1.8501418622646377, + "grad_norm": 0.6455899477005005, + "kl": 0.1112060546875, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 210162501.0, + "reward": 1.4767858982086182, + "reward_std": 0.15260063111782074, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47678571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.46197426319122314, + "step": 1793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1048.0, + "completions/max_terminated_length": 1048.0, + "completions/mean_length": 444.15179443359375, + "completions/mean_terminated_length": 444.15179443359375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 1.8511735878256383, + "grad_norm": 0.7108426094055176, + "kl": 0.109130859375, + "learning_rate": 1e-06, + "loss": 0.0269, + "num_tokens": 210287020.0, + "reward": 1.6000001430511475, + "reward_std": 0.19978836178779602, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5999999642372131, + "rewards/curriculum_aware_reward_fn/std": 0.43335065245628357, + "step": 1794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1021.0, + "completions/max_terminated_length": 1021.0, + "completions/mean_length": 470.46429443359375, + "completions/mean_terminated_length": 470.46429443359375, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 1.8522053133866392, + "grad_norm": 0.8278527855873108, + "kl": 0.10302734375, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 210412449.0, + "reward": 1.411607265472412, + "reward_std": 0.2243293821811676, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41160711646080017, + "rewards/curriculum_aware_reward_fn/std": 0.430115282535553, + "step": 1795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 398.8125305175781, + "completions/mean_terminated_length": 398.8125305175781, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 1.8532370389476398, + "grad_norm": 0.7692654728889465, + "kl": 0.107421875, + "learning_rate": 1e-06, + "loss": 0.0051, + "num_tokens": 210522580.0, + "reward": 1.4446429014205933, + "reward_std": 0.2773749828338623, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4446428418159485, + "rewards/curriculum_aware_reward_fn/std": 0.46165376901626587, + "step": 1796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 901.0, + "completions/max_terminated_length": 901.0, + "completions/mean_length": 405.1696472167969, + "completions/mean_terminated_length": 405.1696472167969, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 1.8542687645086406, + "grad_norm": 0.9778786897659302, + "kl": 0.1536865234375, + "learning_rate": 1e-06, + "loss": -0.0176, + "num_tokens": 210629895.0, + "reward": 1.532142996788025, + "reward_std": 0.22348810732364655, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5321428775787354, + "rewards/curriculum_aware_reward_fn/std": 0.4495171010494232, + "step": 1797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 429.58929443359375, + "completions/mean_terminated_length": 429.58929443359375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 1.8553004900696415, + "grad_norm": 0.8595057129859924, + "kl": 0.0970458984375, + "learning_rate": 1e-06, + "loss": 0.0289, + "num_tokens": 210747470.0, + "reward": 1.4549108743667603, + "reward_std": 0.1990305483341217, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4549107253551483, + "rewards/curriculum_aware_reward_fn/std": 0.4242619276046753, + "step": 1798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1201.0, + "completions/max_terminated_length": 1201.0, + "completions/mean_length": 465.1339416503906, + "completions/mean_terminated_length": 465.1339416503906, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 1.8563322156306423, + "grad_norm": 0.7282925844192505, + "kl": 0.0888671875, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 210867766.0, + "reward": 1.500892996788025, + "reward_std": 0.22749468684196472, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5008928179740906, + "rewards/curriculum_aware_reward_fn/std": 0.4586988091468811, + "step": 1799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1087.0, + "completions/max_terminated_length": 1087.0, + "completions/mean_length": 453.3839416503906, + "completions/mean_terminated_length": 453.3839416503906, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 1.8573639411916432, + "grad_norm": 0.7182178497314453, + "kl": 0.1009521484375, + "learning_rate": 1e-06, + "loss": -0.0028, + "num_tokens": 210981307.0, + "reward": 1.3674107789993286, + "reward_std": 0.22563178837299347, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36741071939468384, + "rewards/curriculum_aware_reward_fn/std": 0.4080667495727539, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 394.45538330078125, + "completions/mean_terminated_length": 394.45538330078125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 1.8583956667526438, + "grad_norm": 0.7896085977554321, + "kl": 0.1163330078125, + "learning_rate": 1e-06, + "loss": 0.0232, + "num_tokens": 211086596.0, + "reward": 1.6044644117355347, + "reward_std": 0.1873040646314621, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6044642329216003, + "rewards/curriculum_aware_reward_fn/std": 0.4967712461948395, + "step": 1801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 420.95538330078125, + "completions/mean_terminated_length": 420.95538330078125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 1.8594273923136446, + "grad_norm": 0.8211988806724548, + "kl": 0.119384765625, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 211204093.0, + "reward": 1.5977680683135986, + "reward_std": 0.21861636638641357, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5977678298950195, + "rewards/curriculum_aware_reward_fn/std": 0.4041111171245575, + "step": 1802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 418.3750305175781, + "completions/mean_terminated_length": 418.3750305175781, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 1.8604591178746452, + "grad_norm": 0.7635685205459595, + "kl": 0.1046142578125, + "learning_rate": 1e-06, + "loss": -0.007, + "num_tokens": 211315358.0, + "reward": 1.289285659790039, + "reward_std": 0.1520286202430725, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.28928571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.37724870443344116, + "step": 1803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 389.8839416503906, + "completions/mean_terminated_length": 389.8839416503906, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 1.861490843435646, + "grad_norm": 0.7321354150772095, + "kl": 0.1064453125, + "learning_rate": 1e-06, + "loss": -0.0067, + "num_tokens": 211423880.0, + "reward": 1.5392858982086182, + "reward_std": 0.17979271709918976, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5392857193946838, + "rewards/curriculum_aware_reward_fn/std": 0.4290902018547058, + "step": 1804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1385.0, + "completions/max_terminated_length": 1385.0, + "completions/mean_length": 468.4464416503906, + "completions/mean_terminated_length": 468.4464416503906, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 1.862522568996647, + "grad_norm": 0.6947965025901794, + "kl": 0.117431640625, + "learning_rate": 1e-06, + "loss": 0.0201, + "num_tokens": 211542552.0, + "reward": 1.4254463911056519, + "reward_std": 0.18100708723068237, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.43437501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.4794906675815582, + "step": 1805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1368.0, + "completions/max_terminated_length": 1368.0, + "completions/mean_length": 430.95538330078125, + "completions/mean_terminated_length": 430.95538330078125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 1.8635542945576478, + "grad_norm": 0.6890603303909302, + "kl": 0.1099853515625, + "learning_rate": 1e-06, + "loss": -0.012, + "num_tokens": 211663024.0, + "reward": 1.3941963911056519, + "reward_std": 0.12359840422868729, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39419645071029663, + "rewards/curriculum_aware_reward_fn/std": 0.42228174209594727, + "step": 1806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1509.0, + "completions/max_terminated_length": 1509.0, + "completions/mean_length": 462.1875305175781, + "completions/mean_terminated_length": 462.1875305175781, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 1.8645860201186486, + "grad_norm": 0.7871446013450623, + "kl": 0.1162109375, + "learning_rate": 1e-06, + "loss": 0.0177, + "num_tokens": 211785636.0, + "reward": 1.46473228931427, + "reward_std": 0.22002539038658142, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4647321403026581, + "rewards/curriculum_aware_reward_fn/std": 0.5697572827339172, + "step": 1807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 429.65179443359375, + "completions/mean_terminated_length": 429.65179443359375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 1.8656177456796492, + "grad_norm": 0.614851713180542, + "kl": 0.1131591796875, + "learning_rate": 1e-06, + "loss": -0.0053, + "num_tokens": 211902861.0, + "reward": 1.4138394594192505, + "reward_std": 0.10892920196056366, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41383928060531616, + "rewards/curriculum_aware_reward_fn/std": 0.44343101978302, + "step": 1808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1656.0, + "completions/max_terminated_length": 1656.0, + "completions/mean_length": 482.982177734375, + "completions/mean_terminated_length": 482.982177734375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 1.8666494712406498, + "grad_norm": 0.6550900340080261, + "kl": 0.107421875, + "learning_rate": 1e-06, + "loss": 0.0188, + "num_tokens": 212023945.0, + "reward": 1.2910715341567993, + "reward_std": 0.13532520830631256, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29107141494750977, + "rewards/curriculum_aware_reward_fn/std": 0.4194169044494629, + "step": 1809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1476.0, + "completions/max_terminated_length": 1476.0, + "completions/mean_length": 448.6964416503906, + "completions/mean_terminated_length": 448.6964416503906, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 1.8676811968016507, + "grad_norm": 0.7046249508857727, + "kl": 0.1085205078125, + "learning_rate": 1e-06, + "loss": -0.011, + "num_tokens": 212141952.0, + "reward": 1.3834823369979858, + "reward_std": 0.16871017217636108, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3834821283817291, + "rewards/curriculum_aware_reward_fn/std": 0.4509443938732147, + "step": 1810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 409.89288330078125, + "completions/mean_terminated_length": 409.89288330078125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 1.8687129223626515, + "grad_norm": 0.8548352122306824, + "kl": 0.124267578125, + "learning_rate": 1e-06, + "loss": -0.0075, + "num_tokens": 212256215.0, + "reward": 1.5258928537368774, + "reward_std": 0.21912138164043427, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5258928537368774, + "rewards/curriculum_aware_reward_fn/std": 0.41611576080322266, + "step": 1811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1222.0, + "completions/max_terminated_length": 1222.0, + "completions/mean_length": 450.4375305175781, + "completions/mean_terminated_length": 450.4375305175781, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 1.8697446479236524, + "grad_norm": 0.5630614161491394, + "kl": 0.089599609375, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 212377609.0, + "reward": 1.415178656578064, + "reward_std": 0.1340329647064209, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4151785671710968, + "rewards/curriculum_aware_reward_fn/std": 0.4354163110256195, + "step": 1812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1065.0, + "completions/max_terminated_length": 1065.0, + "completions/mean_length": 484.08038330078125, + "completions/mean_terminated_length": 484.08038330078125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 1.8707763734846532, + "grad_norm": 0.6303600668907166, + "kl": 0.1007080078125, + "learning_rate": 1e-06, + "loss": -0.0059, + "num_tokens": 212510296.0, + "reward": 1.2727680206298828, + "reward_std": 0.08729679882526398, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2727678418159485, + "rewards/curriculum_aware_reward_fn/std": 0.3543734848499298, + "step": 1813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 388.5000305175781, + "completions/mean_terminated_length": 388.5000305175781, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 1.8718080990456538, + "grad_norm": 0.951493501663208, + "kl": 0.119873046875, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 212613168.0, + "reward": 1.5379464626312256, + "reward_std": 0.21481995284557343, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5379464030265808, + "rewards/curriculum_aware_reward_fn/std": 0.5394460558891296, + "step": 1814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 413.89288330078125, + "completions/mean_terminated_length": 413.89288330078125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 1.8728398246066547, + "grad_norm": 0.8855923414230347, + "kl": 0.113525390625, + "learning_rate": 1e-06, + "loss": 0.0269, + "num_tokens": 212741116.0, + "reward": 1.3468750715255737, + "reward_std": 0.17701692879199982, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34687498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.3718278110027313, + "step": 1815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1193.0, + "completions/max_terminated_length": 1193.0, + "completions/mean_length": 405.5446472167969, + "completions/mean_terminated_length": 405.5446472167969, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 1.8738715501676553, + "grad_norm": 0.7841637134552002, + "kl": 0.12060546875, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 212855620.0, + "reward": 1.4660714864730835, + "reward_std": 0.1883702427148819, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4660714268684387, + "rewards/curriculum_aware_reward_fn/std": 0.4616955816745758, + "step": 1816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 414.9464416503906, + "completions/mean_terminated_length": 414.9464416503906, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 1.8749032757286561, + "grad_norm": 0.7373315691947937, + "kl": 0.1258544921875, + "learning_rate": 1e-06, + "loss": 0.0139, + "num_tokens": 212972598.0, + "reward": 1.3808037042617798, + "reward_std": 0.16404660046100616, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38080358505249023, + "rewards/curriculum_aware_reward_fn/std": 0.41812780499458313, + "step": 1817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 416.71429443359375, + "completions/mean_terminated_length": 416.71429443359375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 1.875935001289657, + "grad_norm": 0.747146487236023, + "kl": 0.1004638671875, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 213077020.0, + "reward": 1.4812500476837158, + "reward_std": 0.19955526292324066, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48125001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.45217815041542053, + "step": 1818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 415.3839416503906, + "completions/mean_terminated_length": 415.3839416503906, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 1.8769667268506578, + "grad_norm": 0.8933224081993103, + "kl": 0.127685546875, + "learning_rate": 1e-06, + "loss": 0.0162, + "num_tokens": 213185899.0, + "reward": 1.4156250953674316, + "reward_std": 0.176749125123024, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4156250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.4053710997104645, + "step": 1819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1053.0, + "completions/max_terminated_length": 1053.0, + "completions/mean_length": 455.33929443359375, + "completions/mean_terminated_length": 455.33929443359375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 1.8779984524116586, + "grad_norm": 0.8117207884788513, + "kl": 0.1043701171875, + "learning_rate": 1e-06, + "loss": -0.0135, + "num_tokens": 213308066.0, + "reward": 1.3950893878936768, + "reward_std": 0.18113988637924194, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3950892984867096, + "rewards/curriculum_aware_reward_fn/std": 0.3946123719215393, + "step": 1820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1113.0, + "completions/max_terminated_length": 1113.0, + "completions/mean_length": 413.1964416503906, + "completions/mean_terminated_length": 413.1964416503906, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 1.8790301779726593, + "grad_norm": 0.756941556930542, + "kl": 0.1058349609375, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 213422930.0, + "reward": 1.5508930683135986, + "reward_std": 0.21374936401844025, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5508928894996643, + "rewards/curriculum_aware_reward_fn/std": 0.4344397783279419, + "step": 1821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 414.0089416503906, + "completions/mean_terminated_length": 414.0089416503906, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 1.88006190353366, + "grad_norm": 0.8877094388008118, + "kl": 0.114990234375, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 213543086.0, + "reward": 1.3727679252624512, + "reward_std": 0.17864759266376495, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3727678656578064, + "rewards/curriculum_aware_reward_fn/std": 0.45340994000434875, + "step": 1822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1414.0, + "completions/max_terminated_length": 1414.0, + "completions/mean_length": 418.51788330078125, + "completions/mean_terminated_length": 418.51788330078125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 1.8810936290946607, + "grad_norm": 0.835811972618103, + "kl": 0.1126708984375, + "learning_rate": 1e-06, + "loss": 0.0463, + "num_tokens": 213651168.0, + "reward": 1.4678572416305542, + "reward_std": 0.22902914881706238, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46785715222358704, + "rewards/curriculum_aware_reward_fn/std": 0.41961249709129333, + "step": 1823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 382.5714416503906, + "completions/mean_terminated_length": 382.5714416503906, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 1.8821253546556616, + "grad_norm": 0.8936151266098022, + "kl": 0.115966796875, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 213761405.0, + "reward": 1.4495537281036377, + "reward_std": 0.20912609994411469, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44955354928970337, + "rewards/curriculum_aware_reward_fn/std": 0.3940146565437317, + "step": 1824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 358.3214416503906, + "completions/mean_terminated_length": 358.3214416503906, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 1.8831570802166624, + "grad_norm": 0.8577347993850708, + "kl": 0.1298828125, + "learning_rate": 1e-06, + "loss": -0.003, + "num_tokens": 213861037.0, + "reward": 1.4705358743667603, + "reward_std": 0.16662080585956573, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4705357253551483, + "rewards/curriculum_aware_reward_fn/std": 0.45205003023147583, + "step": 1825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 445.08929443359375, + "completions/mean_terminated_length": 445.08929443359375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 1.8841888057776632, + "grad_norm": 0.8253490924835205, + "kl": 0.1148681640625, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 213988223.0, + "reward": 1.4049108028411865, + "reward_std": 0.3005301058292389, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40491071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.4229326844215393, + "step": 1826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1327.0, + "completions/max_terminated_length": 1327.0, + "completions/mean_length": 392.9107360839844, + "completions/mean_terminated_length": 392.9107360839844, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 1.885220531338664, + "grad_norm": 0.8072178363800049, + "kl": 0.111572265625, + "learning_rate": 1e-06, + "loss": 0.0163, + "num_tokens": 214099576.0, + "reward": 1.3821429014205933, + "reward_std": 0.1937621682882309, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3821428716182709, + "rewards/curriculum_aware_reward_fn/std": 0.40790924429893494, + "step": 1827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 828.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 420.2321472167969, + "completions/mean_terminated_length": 420.2321472167969, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 1.8862522568996647, + "grad_norm": 0.8928146958351135, + "kl": 0.1011962890625, + "learning_rate": 1e-06, + "loss": 0.0382, + "num_tokens": 214223089.0, + "reward": 1.352678656578064, + "reward_std": 0.1972290426492691, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3616071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.40559056401252747, + "step": 1828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 748.0, + "completions/max_terminated_length": 748.0, + "completions/mean_length": 381.1071472167969, + "completions/mean_terminated_length": 381.1071472167969, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 1.8872839824606653, + "grad_norm": 0.8089170455932617, + "kl": 0.11865234375, + "learning_rate": 1e-06, + "loss": 0.0181, + "num_tokens": 214333239.0, + "reward": 1.3714287281036377, + "reward_std": 0.19539318978786469, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37142854928970337, + "rewards/curriculum_aware_reward_fn/std": 0.44039878249168396, + "step": 1829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1148.0, + "completions/max_terminated_length": 1148.0, + "completions/mean_length": 434.2321472167969, + "completions/mean_terminated_length": 434.2321472167969, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 1.8883157080216662, + "grad_norm": 0.8478483557701111, + "kl": 0.1148681640625, + "learning_rate": 1e-06, + "loss": -0.0157, + "num_tokens": 214448422.0, + "reward": 1.3419644832611084, + "reward_std": 0.22933442890644073, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3419642746448517, + "rewards/curriculum_aware_reward_fn/std": 0.3999185562133789, + "step": 1830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 387.33929443359375, + "completions/mean_terminated_length": 387.33929443359375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 1.889347433582667, + "grad_norm": 0.9443395733833313, + "kl": 0.1378173828125, + "learning_rate": 1e-06, + "loss": 0.0092, + "num_tokens": 214553222.0, + "reward": 1.5116074085235596, + "reward_std": 0.21648088097572327, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5116071105003357, + "rewards/curriculum_aware_reward_fn/std": 0.388168603181839, + "step": 1831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 413.7500305175781, + "completions/mean_terminated_length": 413.7500305175781, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 1.8903791591436678, + "grad_norm": 0.9532350897789001, + "kl": 0.11474609375, + "learning_rate": 1e-06, + "loss": 0.0203, + "num_tokens": 214667658.0, + "reward": 1.44910728931427, + "reward_std": 0.28174877166748047, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4491071403026581, + "rewards/curriculum_aware_reward_fn/std": 0.3961516320705414, + "step": 1832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 382.9196472167969, + "completions/mean_terminated_length": 382.9196472167969, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 1.8914108847046687, + "grad_norm": 0.7535876631736755, + "kl": 0.1142578125, + "learning_rate": 1e-06, + "loss": -0.0041, + "num_tokens": 214768435.0, + "reward": 1.3669644594192505, + "reward_std": 0.10936643183231354, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36696431040763855, + "rewards/curriculum_aware_reward_fn/std": 0.43617674708366394, + "step": 1833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 371.0982360839844, + "completions/mean_terminated_length": 371.0982360839844, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 1.8924426102656693, + "grad_norm": 0.8579598665237427, + "kl": 0.13427734375, + "learning_rate": 1e-06, + "loss": 0.0134, + "num_tokens": 214875140.0, + "reward": 1.4062501192092896, + "reward_std": 0.22308197617530823, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40625, + "rewards/curriculum_aware_reward_fn/std": 0.4337727427482605, + "step": 1834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 342.02679443359375, + "completions/mean_terminated_length": 342.02679443359375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 1.8934743358266701, + "grad_norm": 2.2421979904174805, + "kl": 0.490478515625, + "learning_rate": 1e-06, + "loss": 0.0286, + "num_tokens": 214990636.0, + "reward": 1.3776785135269165, + "reward_std": 0.1712455004453659, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.38660717010498047, + "rewards/curriculum_aware_reward_fn/std": 0.3968900740146637, + "step": 1835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 352.6875305175781, + "completions/mean_terminated_length": 352.6875305175781, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 1.8945060613876707, + "grad_norm": 0.8646315932273865, + "kl": 0.12158203125, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 215093662.0, + "reward": 1.5406250953674316, + "reward_std": 0.2044602483510971, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5495535731315613, + "rewards/curriculum_aware_reward_fn/std": 0.4226415753364563, + "step": 1836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 385.2946472167969, + "completions/mean_terminated_length": 385.2946472167969, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 1.8955377869486716, + "grad_norm": 0.8068304657936096, + "kl": 0.125244140625, + "learning_rate": 1e-06, + "loss": 0.0347, + "num_tokens": 215206269.0, + "reward": 1.320089340209961, + "reward_std": 0.1882740557193756, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.32901784777641296, + "rewards/curriculum_aware_reward_fn/std": 0.3777981698513031, + "step": 1837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 386.5982360839844, + "completions/mean_terminated_length": 386.5982360839844, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 1.8965695125096724, + "grad_norm": 0.6929387450218201, + "kl": 0.1082763671875, + "learning_rate": 1e-06, + "loss": 0.0153, + "num_tokens": 215314311.0, + "reward": 1.5660713911056519, + "reward_std": 0.14396119117736816, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5660713911056519, + "rewards/curriculum_aware_reward_fn/std": 0.46548500657081604, + "step": 1838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 384.9464416503906, + "completions/mean_terminated_length": 384.9464416503906, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 1.8976012380706733, + "grad_norm": 0.7868221402168274, + "kl": 0.1160888671875, + "learning_rate": 1e-06, + "loss": 0.0259, + "num_tokens": 215437491.0, + "reward": 1.4455357789993286, + "reward_std": 0.25672388076782227, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.45446425676345825, + "rewards/curriculum_aware_reward_fn/std": 0.46521106362342834, + "step": 1839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1219.0, + "completions/max_terminated_length": 1219.0, + "completions/mean_length": 374.0089416503906, + "completions/mean_terminated_length": 374.0089416503906, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 1.898632963631674, + "grad_norm": 0.8789774775505066, + "kl": 0.1220703125, + "learning_rate": 1e-06, + "loss": 0.0067, + "num_tokens": 215551263.0, + "reward": 1.5174108743667603, + "reward_std": 0.2568131983280182, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5263392329216003, + "rewards/curriculum_aware_reward_fn/std": 0.4336602985858917, + "step": 1840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 381.8482360839844, + "completions/mean_terminated_length": 348.38739013671875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 1.8996646891926747, + "grad_norm": 0.6855369806289673, + "kl": 0.1339111328125, + "learning_rate": 1e-06, + "loss": 0.0492, + "num_tokens": 215662868.0, + "reward": 1.5156251192092896, + "reward_std": 0.17239533364772797, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.5424107313156128, + "rewards/curriculum_aware_reward_fn/std": 0.4515842795372009, + "step": 1841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 815.0, + "completions/max_terminated_length": 815.0, + "completions/mean_length": 356.4375305175781, + "completions/mean_terminated_length": 356.4375305175781, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 1.9006964147536756, + "grad_norm": 0.827491283416748, + "kl": 0.1326904296875, + "learning_rate": 1e-06, + "loss": 0.0239, + "num_tokens": 215771354.0, + "reward": 1.434821605682373, + "reward_std": 0.21416500210762024, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4348214268684387, + "rewards/curriculum_aware_reward_fn/std": 0.547674834728241, + "step": 1842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 362.2857360839844, + "completions/mean_terminated_length": 362.2857360839844, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 1.9017281403146762, + "grad_norm": 0.8723616003990173, + "kl": 0.120849609375, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 215879081.0, + "reward": 1.509374976158142, + "reward_std": 0.2041958123445511, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5093749761581421, + "rewards/curriculum_aware_reward_fn/std": 0.42243027687072754, + "step": 1843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 800.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 383.1160888671875, + "completions/mean_terminated_length": 383.1160888671875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 1.902759865875677, + "grad_norm": 0.7478457093238831, + "kl": 0.119384765625, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 215992333.0, + "reward": 1.4169644117355347, + "reward_std": 0.23326881229877472, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.42589282989501953, + "rewards/curriculum_aware_reward_fn/std": 0.45217105746269226, + "step": 1844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 944.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 404.8571472167969, + "completions/mean_terminated_length": 404.8571472167969, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 1.9037915914366779, + "grad_norm": 0.8745235204696655, + "kl": 0.1361083984375, + "learning_rate": 1e-06, + "loss": -0.0051, + "num_tokens": 216103785.0, + "reward": 1.2758928537368774, + "reward_std": 0.21973289549350739, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3482986092567444, + "step": 1845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 362.01788330078125, + "completions/mean_terminated_length": 362.01788330078125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 1.9048233169976787, + "grad_norm": 0.8361862897872925, + "kl": 0.1265869140625, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 216215773.0, + "reward": 1.4366072416305542, + "reward_std": 0.24635306000709534, + "rewards/code_format_reward/mean": 0.9642857313156128, + "rewards/code_format_reward/std": 0.18641093373298645, + "rewards/curriculum_aware_reward_fn/mean": 0.47232145071029663, + "rewards/curriculum_aware_reward_fn/std": 0.44336140155792236, + "step": 1846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 903.0, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 357.6250305175781, + "completions/mean_terminated_length": 357.6250305175781, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 1.9058550425586795, + "grad_norm": 0.8441395163536072, + "kl": 0.1273193359375, + "learning_rate": 1e-06, + "loss": -0.022, + "num_tokens": 216324245.0, + "reward": 1.4339287281036377, + "reward_std": 0.28452068567276, + "rewards/code_format_reward/mean": 0.9642857313156128, + "rewards/code_format_reward/std": 0.18641091883182526, + "rewards/curriculum_aware_reward_fn/mean": 0.46964284777641296, + "rewards/curriculum_aware_reward_fn/std": 0.4514918625354767, + "step": 1847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 376.77679443359375, + "completions/mean_terminated_length": 376.77679443359375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 1.9068867681196802, + "grad_norm": 0.9068071842193604, + "kl": 0.11669921875, + "learning_rate": 1e-06, + "loss": -0.0062, + "num_tokens": 216434743.0, + "reward": 1.4044644832611084, + "reward_std": 0.2747339606285095, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.42232146859169006, + "rewards/curriculum_aware_reward_fn/std": 0.42853403091430664, + "step": 1848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 898.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 374.9285888671875, + "completions/mean_terminated_length": 374.9285888671875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 1.9079184936806808, + "grad_norm": 0.8925588726997375, + "kl": 0.12060546875, + "learning_rate": 1e-06, + "loss": -0.0087, + "num_tokens": 216550270.0, + "reward": 1.2790179252624512, + "reward_std": 0.20408707857131958, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.296875, + "rewards/curriculum_aware_reward_fn/std": 0.3787890374660492, + "step": 1849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 970.0, + "completions/max_terminated_length": 970.0, + "completions/mean_length": 348.0625305175781, + "completions/mean_terminated_length": 348.0625305175781, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 1.9089502192416816, + "grad_norm": 0.9206534624099731, + "kl": 0.1251220703125, + "learning_rate": 1e-06, + "loss": 0.0216, + "num_tokens": 216657874.0, + "reward": 1.3379465341567993, + "reward_std": 0.18383868038654327, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.35580354928970337, + "rewards/curriculum_aware_reward_fn/std": 0.3915058374404907, + "step": 1850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 365.8482360839844, + "completions/mean_terminated_length": 365.8482360839844, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 1.9099819448026825, + "grad_norm": 0.9138371348381042, + "kl": 0.116943359375, + "learning_rate": 1e-06, + "loss": -0.0115, + "num_tokens": 216758622.0, + "reward": 1.4017857313156128, + "reward_std": 0.26434412598609924, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4107142984867096, + "rewards/curriculum_aware_reward_fn/std": 0.4232770502567291, + "step": 1851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 854.0, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 361.1160888671875, + "completions/mean_terminated_length": 361.1160888671875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 1.9110136703636833, + "grad_norm": 0.9135972857475281, + "kl": 0.13037109375, + "learning_rate": 1e-06, + "loss": -0.0204, + "num_tokens": 216867989.0, + "reward": 1.4691966772079468, + "reward_std": 0.2713601589202881, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.4959821403026581, + "rewards/curriculum_aware_reward_fn/std": 0.44235947728157043, + "step": 1852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 320.90179443359375, + "completions/mean_terminated_length": 320.90179443359375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 1.9120453959246841, + "grad_norm": 1.002255916595459, + "kl": 0.123291015625, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 216961547.0, + "reward": 1.5370535850524902, + "reward_std": 0.27375054359436035, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5459821820259094, + "rewards/curriculum_aware_reward_fn/std": 0.49616265296936035, + "step": 1853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 346.6964416503906, + "completions/mean_terminated_length": 346.6964416503906, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 1.9130771214856848, + "grad_norm": 0.9413928389549255, + "kl": 0.1341552734375, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 217071658.0, + "reward": 1.391517996788025, + "reward_std": 0.24773217737674713, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39151784777641296, + "rewards/curriculum_aware_reward_fn/std": 0.43365657329559326, + "step": 1854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 319.1339416503906, + "completions/mean_terminated_length": 319.1339416503906, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 1.9141088470466856, + "grad_norm": 0.9463006854057312, + "kl": 0.1324462890625, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 217172347.0, + "reward": 1.5656250715255737, + "reward_std": 0.23232321441173553, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5745535492897034, + "rewards/curriculum_aware_reward_fn/std": 0.4415367841720581, + "step": 1855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.0, + "completions/max_terminated_length": 667.0, + "completions/mean_length": 361.3035888671875, + "completions/mean_terminated_length": 361.3035888671875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 1.9151405726076862, + "grad_norm": 0.8216485977172852, + "kl": 0.1285400390625, + "learning_rate": 1e-06, + "loss": 0.0237, + "num_tokens": 217275071.0, + "reward": 1.379017949104309, + "reward_std": 0.1718110740184784, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3879464268684387, + "rewards/curriculum_aware_reward_fn/std": 0.3956546485424042, + "step": 1856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 360.83929443359375, + "completions/mean_terminated_length": 360.83929443359375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 1.916172298168687, + "grad_norm": 0.9167246222496033, + "kl": 0.1119384765625, + "learning_rate": 1e-06, + "loss": 0.0201, + "num_tokens": 217371968.0, + "reward": 1.4901785850524902, + "reward_std": 0.1831408590078354, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.49017858505249023, + "rewards/curriculum_aware_reward_fn/std": 0.3973923921585083, + "step": 1857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 341.6339416503906, + "completions/mean_terminated_length": 341.6339416503906, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 1.917204023729688, + "grad_norm": 0.8857439160346985, + "kl": 0.1221923828125, + "learning_rate": 1e-06, + "loss": 0.0116, + "num_tokens": 217477886.0, + "reward": 1.5772322416305542, + "reward_std": 0.25595399737358093, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5772321820259094, + "rewards/curriculum_aware_reward_fn/std": 0.5594353675842285, + "step": 1858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 349.9464416503906, + "completions/mean_terminated_length": 349.9464416503906, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 1.9182357492906887, + "grad_norm": 0.9459420442581177, + "kl": 0.1263427734375, + "learning_rate": 1e-06, + "loss": 0.0211, + "num_tokens": 217581256.0, + "reward": 1.3508927822113037, + "reward_std": 0.1190386638045311, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3508928418159485, + "rewards/curriculum_aware_reward_fn/std": 0.4276471436023712, + "step": 1859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.0, + "completions/max_terminated_length": 697.0, + "completions/mean_length": 358.6339416503906, + "completions/mean_terminated_length": 358.6339416503906, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 1.9192674748516896, + "grad_norm": 0.9291829466819763, + "kl": 0.1224365234375, + "learning_rate": 1e-06, + "loss": 0.0183, + "num_tokens": 217687454.0, + "reward": 1.5843751430511475, + "reward_std": 0.2358408272266388, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5843749642372131, + "rewards/curriculum_aware_reward_fn/std": 0.37901195883750916, + "step": 1860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 961.0, + "completions/max_terminated_length": 961.0, + "completions/mean_length": 395.1250305175781, + "completions/mean_terminated_length": 395.1250305175781, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 1.9202992004126902, + "grad_norm": 0.7725751996040344, + "kl": 0.104736328125, + "learning_rate": 1e-06, + "loss": 0.0074, + "num_tokens": 217811746.0, + "reward": 1.4312500953674316, + "reward_std": 0.18673662841320038, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.4491071403026581, + "rewards/curriculum_aware_reward_fn/std": 0.4288042187690735, + "step": 1861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 313.08038330078125, + "completions/mean_terminated_length": 313.08038330078125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 1.921330925973691, + "grad_norm": 0.836054801940918, + "kl": 0.133544921875, + "learning_rate": 1e-06, + "loss": -0.0099, + "num_tokens": 217908082.0, + "reward": 1.7017858028411865, + "reward_std": 0.2549782395362854, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.7107143402099609, + "rewards/curriculum_aware_reward_fn/std": 0.39770904183387756, + "step": 1862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 384.2232360839844, + "completions/mean_terminated_length": 384.2232360839844, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 1.9223626515346917, + "grad_norm": 0.8646076917648315, + "kl": 0.1246337890625, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 218020282.0, + "reward": 1.3397324085235596, + "reward_std": 0.25197386741638184, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3486607074737549, + "rewards/curriculum_aware_reward_fn/std": 0.388022243976593, + "step": 1863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 362.3660888671875, + "completions/mean_terminated_length": 362.3660888671875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 1.9233943770956925, + "grad_norm": 0.8585340976715088, + "kl": 0.121337890625, + "learning_rate": 1e-06, + "loss": 0.0101, + "num_tokens": 218131511.0, + "reward": 1.6879466772079468, + "reward_std": 0.2607966363430023, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6879464387893677, + "rewards/curriculum_aware_reward_fn/std": 0.36743852496147156, + "step": 1864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 360.76788330078125, + "completions/mean_terminated_length": 360.76788330078125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 1.9244261026566933, + "grad_norm": 0.7554998397827148, + "kl": 0.1197509765625, + "learning_rate": 1e-06, + "loss": -0.0082, + "num_tokens": 218241917.0, + "reward": 1.380357265472412, + "reward_std": 0.15899653732776642, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.38928571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.447486937046051, + "step": 1865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 925.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 344.2946472167969, + "completions/mean_terminated_length": 344.2946472167969, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 1.9254578282176942, + "grad_norm": 0.8225184082984924, + "kl": 0.15380859375, + "learning_rate": 1e-06, + "loss": 0.0085, + "num_tokens": 218339964.0, + "reward": 1.528571605682373, + "reward_std": 0.14810487627983093, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5285714268684387, + "rewards/curriculum_aware_reward_fn/std": 0.40640780329704285, + "step": 1866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 669.0, + "completions/max_terminated_length": 669.0, + "completions/mean_length": 345.2589416503906, + "completions/mean_terminated_length": 345.2589416503906, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 1.926489553778695, + "grad_norm": 0.8637830018997192, + "kl": 0.113525390625, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 218435718.0, + "reward": 1.4928573369979858, + "reward_std": 0.2717527449131012, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4928571581840515, + "rewards/curriculum_aware_reward_fn/std": 0.4726700484752655, + "step": 1867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 353.9196472167969, + "completions/mean_terminated_length": 353.9196472167969, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 1.9275212793396956, + "grad_norm": 0.887877881526947, + "kl": 0.134033203125, + "learning_rate": 1e-06, + "loss": 0.0117, + "num_tokens": 218547618.0, + "reward": 1.3991073369979858, + "reward_std": 0.2046002745628357, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.4169642925262451, + "rewards/curriculum_aware_reward_fn/std": 0.42461180686950684, + "step": 1868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 366.3214416503906, + "completions/mean_terminated_length": 366.3214416503906, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 1.9285530049006963, + "grad_norm": 0.8546643853187561, + "kl": 0.13525390625, + "learning_rate": 1e-06, + "loss": 0.025, + "num_tokens": 218661201.0, + "reward": 1.4066965579986572, + "reward_std": 0.2563186585903168, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40669646859169006, + "rewards/curriculum_aware_reward_fn/std": 0.431657999753952, + "step": 1869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 393.33929443359375, + "completions/mean_terminated_length": 393.33929443359375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 1.929584730461697, + "grad_norm": 0.8663255572319031, + "kl": 0.116455078125, + "learning_rate": 1e-06, + "loss": 0.0169, + "num_tokens": 218770761.0, + "reward": 1.5607144832611084, + "reward_std": 0.2684311270713806, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5607143044471741, + "rewards/curriculum_aware_reward_fn/std": 0.4703567922115326, + "step": 1870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 929.0, + "completions/max_terminated_length": 929.0, + "completions/mean_length": 386.4910888671875, + "completions/mean_terminated_length": 386.4910888671875, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 1.930616456022698, + "grad_norm": 0.8403106331825256, + "kl": 0.1114501953125, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 218880758.0, + "reward": 1.508928656578064, + "reward_std": 0.27292829751968384, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5089285373687744, + "rewards/curriculum_aware_reward_fn/std": 0.5379396080970764, + "step": 1871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 344.1250305175781, + "completions/mean_terminated_length": 344.1250305175781, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 1.9316481815836988, + "grad_norm": 0.9799405932426453, + "kl": 0.1356201171875, + "learning_rate": 1e-06, + "loss": -0.01, + "num_tokens": 218978660.0, + "reward": 1.2950893640518188, + "reward_std": 0.16900257766246796, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2950892746448517, + "rewards/curriculum_aware_reward_fn/std": 0.3908848762512207, + "step": 1872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 384.15179443359375, + "completions/mean_terminated_length": 384.15179443359375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 1.9326799071446996, + "grad_norm": 0.8737776875495911, + "kl": 0.11572265625, + "learning_rate": 1e-06, + "loss": -0.0274, + "num_tokens": 219086967.0, + "reward": 1.368303656578064, + "reward_std": 0.1639767438173294, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3683035671710968, + "rewards/curriculum_aware_reward_fn/std": 0.431423157453537, + "step": 1873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 349.2410888671875, + "completions/mean_terminated_length": 349.2410888671875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 1.9337116327057002, + "grad_norm": 0.9632400274276733, + "kl": 0.13720703125, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 219193772.0, + "reward": 1.5540179014205933, + "reward_std": 0.20878908038139343, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5540178418159485, + "rewards/curriculum_aware_reward_fn/std": 0.4543651342391968, + "step": 1874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 968.0, + "completions/max_terminated_length": 968.0, + "completions/mean_length": 403.5000305175781, + "completions/mean_terminated_length": 403.5000305175781, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 1.934743358266701, + "grad_norm": 0.6932012438774109, + "kl": 0.112060546875, + "learning_rate": 1e-06, + "loss": 0.0101, + "num_tokens": 219314522.0, + "reward": 1.4258930683135986, + "reward_std": 0.15361928939819336, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42589282989501953, + "rewards/curriculum_aware_reward_fn/std": 0.4594837427139282, + "step": 1875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 391.3839416503906, + "completions/mean_terminated_length": 391.3839416503906, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 1.9357750838277017, + "grad_norm": 0.9562661647796631, + "kl": 0.1275634765625, + "learning_rate": 1e-06, + "loss": -0.0157, + "num_tokens": 219425620.0, + "reward": 1.3147321939468384, + "reward_std": 0.23054131865501404, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3147321343421936, + "rewards/curriculum_aware_reward_fn/std": 0.37039291858673096, + "step": 1876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 861.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 416.7321472167969, + "completions/mean_terminated_length": 416.7321472167969, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 1.9368068093887025, + "grad_norm": 0.8335247039794922, + "kl": 0.1182861328125, + "learning_rate": 1e-06, + "loss": -0.0149, + "num_tokens": 219540887.0, + "reward": 1.3441965579986572, + "reward_std": 0.25252988934516907, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3441964089870453, + "rewards/curriculum_aware_reward_fn/std": 0.3944288492202759, + "step": 1877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 385.89288330078125, + "completions/mean_terminated_length": 385.89288330078125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 1.9378385349497034, + "grad_norm": 0.8629086017608643, + "kl": 0.1168212890625, + "learning_rate": 1e-06, + "loss": -0.0315, + "num_tokens": 219650754.0, + "reward": 1.377678632736206, + "reward_std": 0.1759658306837082, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3866071403026581, + "rewards/curriculum_aware_reward_fn/std": 0.4390295147895813, + "step": 1878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 338.3214416503906, + "completions/mean_terminated_length": 338.3214416503906, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 1.9388702605107042, + "grad_norm": 0.8721398115158081, + "kl": 0.12548828125, + "learning_rate": 1e-06, + "loss": 0.0169, + "num_tokens": 219751233.0, + "reward": 1.4821429252624512, + "reward_std": 0.1664353460073471, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4910714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.4412418007850647, + "step": 1879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.0, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 370.21429443359375, + "completions/mean_terminated_length": 370.21429443359375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 1.939901986071705, + "grad_norm": 0.8873890042304993, + "kl": 0.1141357421875, + "learning_rate": 1e-06, + "loss": -0.024, + "num_tokens": 219851847.0, + "reward": 1.4955357313156128, + "reward_std": 0.2472870945930481, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4955357015132904, + "rewards/curriculum_aware_reward_fn/std": 0.45458391308784485, + "step": 1880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1015.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 426.7232360839844, + "completions/mean_terminated_length": 426.7232360839844, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 1.9409337116327057, + "grad_norm": 0.6784756183624268, + "kl": 0.108154296875, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 219969122.0, + "reward": 1.3883929252624512, + "reward_std": 0.13435271382331848, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3883928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.4352167248725891, + "step": 1881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 393.9107360839844, + "completions/mean_terminated_length": 393.9107360839844, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 1.9419654371937065, + "grad_norm": 0.8890510201454163, + "kl": 0.1182861328125, + "learning_rate": 1e-06, + "loss": -0.0259, + "num_tokens": 220083674.0, + "reward": 1.3825894594192505, + "reward_std": 0.20253174006938934, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38258928060531616, + "rewards/curriculum_aware_reward_fn/std": 0.429052472114563, + "step": 1882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 370.89288330078125, + "completions/mean_terminated_length": 370.89288330078125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 1.9429971627547071, + "grad_norm": 0.7917482256889343, + "kl": 0.1241455078125, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 220191504.0, + "reward": 1.5214287042617798, + "reward_std": 0.14497151970863342, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5214285850524902, + "rewards/curriculum_aware_reward_fn/std": 0.4005465805530548, + "step": 1883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 948.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 404.6160888671875, + "completions/mean_terminated_length": 404.6160888671875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 1.944028888315708, + "grad_norm": 0.8562974333763123, + "kl": 0.13427734375, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 220300983.0, + "reward": 1.3504464626312256, + "reward_std": 0.1296975463628769, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.359375, + "rewards/curriculum_aware_reward_fn/std": 0.41435563564300537, + "step": 1884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 408.2946472167969, + "completions/mean_terminated_length": 408.2946472167969, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 1.9450606138767088, + "grad_norm": 1.1476508378982544, + "kl": 0.1634521484375, + "learning_rate": 1e-06, + "loss": -0.0169, + "num_tokens": 220421668.0, + "reward": 1.2767857313156128, + "reward_std": 0.19308973848819733, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2767857015132904, + "rewards/curriculum_aware_reward_fn/std": 0.363660603761673, + "step": 1885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 879.0, + "completions/max_terminated_length": 879.0, + "completions/mean_length": 415.1607360839844, + "completions/mean_terminated_length": 415.1607360839844, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 1.9460923394377097, + "grad_norm": 0.7268807888031006, + "kl": 0.121337890625, + "learning_rate": 1e-06, + "loss": 0.0144, + "num_tokens": 220539909.0, + "reward": 1.4200893640518188, + "reward_std": 0.18233086168766022, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4200892746448517, + "rewards/curriculum_aware_reward_fn/std": 0.4332186281681061, + "step": 1886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1001.0, + "completions/max_terminated_length": 1001.0, + "completions/mean_length": 421.7589416503906, + "completions/mean_terminated_length": 421.7589416503906, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 1.9471240649987105, + "grad_norm": 0.8596259951591492, + "kl": 0.121337890625, + "learning_rate": 1e-06, + "loss": 0.0174, + "num_tokens": 220650363.0, + "reward": 1.4325894117355347, + "reward_std": 0.20996662974357605, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43258926272392273, + "rewards/curriculum_aware_reward_fn/std": 0.4076802134513855, + "step": 1887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 403.65179443359375, + "completions/mean_terminated_length": 403.65179443359375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 1.948155790559711, + "grad_norm": 0.64628666639328, + "kl": 0.1204833984375, + "learning_rate": 1e-06, + "loss": -0.0041, + "num_tokens": 220757648.0, + "reward": 1.6053574085235596, + "reward_std": 0.09190364181995392, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6053571701049805, + "rewards/curriculum_aware_reward_fn/std": 0.42486655712127686, + "step": 1888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 782.0, + "completions/max_terminated_length": 782.0, + "completions/mean_length": 356.52679443359375, + "completions/mean_terminated_length": 356.52679443359375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 1.9491875161207117, + "grad_norm": 0.877808690071106, + "kl": 0.1322021484375, + "learning_rate": 1e-06, + "loss": 0.0202, + "num_tokens": 220858108.0, + "reward": 1.5473216772079468, + "reward_std": 0.15397398173809052, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5473214387893677, + "rewards/curriculum_aware_reward_fn/std": 0.4423442482948303, + "step": 1889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 405.08038330078125, + "completions/mean_terminated_length": 405.08038330078125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 1.9502192416817126, + "grad_norm": 0.8030251264572144, + "kl": 0.1214599609375, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 220969599.0, + "reward": 1.581696629524231, + "reward_std": 0.27979207038879395, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5816964507102966, + "rewards/curriculum_aware_reward_fn/std": 0.45152372121810913, + "step": 1890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1016.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 446.8125305175781, + "completions/mean_terminated_length": 446.8125305175781, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 1.9512509672427134, + "grad_norm": 0.7618669271469116, + "kl": 0.112548828125, + "learning_rate": 1e-06, + "loss": -0.0191, + "num_tokens": 221093793.0, + "reward": 1.5339287519454956, + "reward_std": 0.1955319494009018, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5339285731315613, + "rewards/curriculum_aware_reward_fn/std": 0.4265519678592682, + "step": 1891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1333.0, + "completions/max_terminated_length": 1333.0, + "completions/mean_length": 491.08038330078125, + "completions/mean_terminated_length": 491.08038330078125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 1.9522826928037142, + "grad_norm": 0.8040339350700378, + "kl": 0.108642578125, + "learning_rate": 1e-06, + "loss": 0.015, + "num_tokens": 221218843.0, + "reward": 1.4285714626312256, + "reward_std": 0.23945212364196777, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4285714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.4297869801521301, + "step": 1892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 746.0, + "completions/max_terminated_length": 746.0, + "completions/mean_length": 398.5446472167969, + "completions/mean_terminated_length": 398.5446472167969, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 1.953314418364715, + "grad_norm": 0.6367735266685486, + "kl": 0.1279296875, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 221325871.0, + "reward": 1.5861608982086182, + "reward_std": 0.1092190220952034, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5861607193946838, + "rewards/curriculum_aware_reward_fn/std": 0.43481433391571045, + "step": 1893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1197.0, + "completions/max_terminated_length": 1197.0, + "completions/mean_length": 435.5000305175781, + "completions/mean_terminated_length": 435.5000305175781, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 1.9543461439257157, + "grad_norm": 0.738890528678894, + "kl": 0.1275634765625, + "learning_rate": 1e-06, + "loss": 0.0317, + "num_tokens": 221447587.0, + "reward": 1.4392858743667603, + "reward_std": 0.14185898005962372, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4392856955528259, + "rewards/curriculum_aware_reward_fn/std": 0.42930009961128235, + "step": 1894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 854.0, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 388.0089416503906, + "completions/mean_terminated_length": 388.0089416503906, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 1.9553778694867165, + "grad_norm": 0.7600980997085571, + "kl": 0.123779296875, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 221554492.0, + "reward": 1.6709821224212646, + "reward_std": 0.1413257122039795, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6709821820259094, + "rewards/curriculum_aware_reward_fn/std": 0.41620543599128723, + "step": 1895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 775.0, + "completions/max_terminated_length": 775.0, + "completions/mean_length": 437.0625305175781, + "completions/mean_terminated_length": 437.0625305175781, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 1.9564095950477172, + "grad_norm": 0.7145452499389648, + "kl": 0.11181640625, + "learning_rate": 1e-06, + "loss": 0.0277, + "num_tokens": 221663636.0, + "reward": 1.589285969734192, + "reward_std": 0.13859732449054718, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5892857313156128, + "rewards/curriculum_aware_reward_fn/std": 0.4321238398551941, + "step": 1896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2998.0, + "completions/max_terminated_length": 2998.0, + "completions/mean_length": 515.794677734375, + "completions/mean_terminated_length": 515.794677734375, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 1.957441320608718, + "grad_norm": 0.570237398147583, + "kl": 0.1094970703125, + "learning_rate": 1e-06, + "loss": 0.0342, + "num_tokens": 221785686.0, + "reward": 1.4093750715255737, + "reward_std": 0.15295647084712982, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.41830354928970337, + "rewards/curriculum_aware_reward_fn/std": 0.45212188363075256, + "step": 1897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 401.83929443359375, + "completions/mean_terminated_length": 401.83929443359375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 1.9584730461697188, + "grad_norm": 0.6868841648101807, + "kl": 0.1219482421875, + "learning_rate": 1e-06, + "loss": -0.0128, + "num_tokens": 221896205.0, + "reward": 1.5933037996292114, + "reward_std": 0.2061375081539154, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5933035612106323, + "rewards/curriculum_aware_reward_fn/std": 0.4708864092826843, + "step": 1898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1397.0, + "completions/max_terminated_length": 1397.0, + "completions/mean_length": 465.6875305175781, + "completions/mean_terminated_length": 465.6875305175781, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 1.9595047717307197, + "grad_norm": 0.7461130619049072, + "kl": 0.115234375, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 222008534.0, + "reward": 1.2727680206298828, + "reward_std": 0.18516550958156586, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2727678418159485, + "rewards/curriculum_aware_reward_fn/std": 0.39231860637664795, + "step": 1899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1021.0, + "completions/max_terminated_length": 1021.0, + "completions/mean_length": 430.0000305175781, + "completions/mean_terminated_length": 430.0000305175781, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 1.9605364972917205, + "grad_norm": 0.7491604685783386, + "kl": 0.1195068359375, + "learning_rate": 1e-06, + "loss": -0.0155, + "num_tokens": 222123719.0, + "reward": 1.4834821224212646, + "reward_std": 0.19401638209819794, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48348215222358704, + "rewards/curriculum_aware_reward_fn/std": 0.44597241282463074, + "step": 1900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1003.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 414.5446472167969, + "completions/mean_terminated_length": 414.5446472167969, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 1.9615682228527211, + "grad_norm": 0.7671953439712524, + "kl": 0.10888671875, + "learning_rate": 1e-06, + "loss": 0.0175, + "num_tokens": 222236802.0, + "reward": 1.532589316368103, + "reward_std": 0.18732120096683502, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.532589316368103, + "rewards/curriculum_aware_reward_fn/std": 0.38488972187042236, + "step": 1901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1200.0, + "completions/max_terminated_length": 1200.0, + "completions/mean_length": 460.8660888671875, + "completions/mean_terminated_length": 460.8660888671875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 1.962599948413722, + "grad_norm": 0.7865824699401855, + "kl": 0.1195068359375, + "learning_rate": 1e-06, + "loss": -0.0107, + "num_tokens": 222369955.0, + "reward": 1.4214287996292114, + "reward_std": 0.20716939866542816, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4214286208152771, + "rewards/curriculum_aware_reward_fn/std": 0.4383997619152069, + "step": 1902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 836.0, + "completions/max_terminated_length": 836.0, + "completions/mean_length": 453.3482360839844, + "completions/mean_terminated_length": 453.3482360839844, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 1.9636316739747226, + "grad_norm": 0.8720301985740662, + "kl": 0.1109619140625, + "learning_rate": 1e-06, + "loss": 0.018, + "num_tokens": 222485613.0, + "reward": 1.5343750715255737, + "reward_std": 0.2358490377664566, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.534375011920929, + "rewards/curriculum_aware_reward_fn/std": 0.43599480390548706, + "step": 1903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 957.0, + "completions/max_terminated_length": 957.0, + "completions/mean_length": 515.4642944335938, + "completions/mean_terminated_length": 515.4642944335938, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 1.9646633995357234, + "grad_norm": 0.723853349685669, + "kl": 0.105224609375, + "learning_rate": 1e-06, + "loss": -0.0098, + "num_tokens": 222620912.0, + "reward": 1.378571629524231, + "reward_std": 0.25803887844085693, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37857145071029663, + "rewards/curriculum_aware_reward_fn/std": 0.43130403757095337, + "step": 1904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1097.0, + "completions/max_terminated_length": 1097.0, + "completions/mean_length": 467.4464416503906, + "completions/mean_terminated_length": 467.4464416503906, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 1.9656951250967243, + "grad_norm": 0.8006047606468201, + "kl": 0.1103515625, + "learning_rate": 1e-06, + "loss": 0.0055, + "num_tokens": 222734082.0, + "reward": 1.364732265472412, + "reward_std": 0.20015935599803925, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36473211646080017, + "rewards/curriculum_aware_reward_fn/std": 0.3692357838153839, + "step": 1905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1091.0, + "completions/max_terminated_length": 1091.0, + "completions/mean_length": 507.4107360839844, + "completions/mean_terminated_length": 507.4107360839844, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 1.9667268506577251, + "grad_norm": 0.723853349685669, + "kl": 0.1212158203125, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 222857034.0, + "reward": 1.4633928537368774, + "reward_std": 0.22907757759094238, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46339288353919983, + "rewards/curriculum_aware_reward_fn/std": 0.4332977831363678, + "step": 1906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1025.0, + "completions/max_terminated_length": 1025.0, + "completions/mean_length": 485.2410888671875, + "completions/mean_terminated_length": 485.2410888671875, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 1.967758576218726, + "grad_norm": 0.6630465388298035, + "kl": 0.1126708984375, + "learning_rate": 1e-06, + "loss": -0.0124, + "num_tokens": 222978490.0, + "reward": 1.5424107313156128, + "reward_std": 0.2230999618768692, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5424107313156128, + "rewards/curriculum_aware_reward_fn/std": 0.4499354362487793, + "step": 1907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1182.0, + "completions/max_terminated_length": 1182.0, + "completions/mean_length": 483.83929443359375, + "completions/mean_terminated_length": 483.83929443359375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 1.9687903017797266, + "grad_norm": 0.6432393193244934, + "kl": 0.120361328125, + "learning_rate": 1e-06, + "loss": 0.0293, + "num_tokens": 223105697.0, + "reward": 1.5803571939468384, + "reward_std": 0.15816918015480042, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5803571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.411838173866272, + "step": 1908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 839.0, + "completions/max_terminated_length": 839.0, + "completions/mean_length": 442.3660888671875, + "completions/mean_terminated_length": 442.3660888671875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 1.9698220273407274, + "grad_norm": 0.7805054783821106, + "kl": 0.1103515625, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 223216114.0, + "reward": 1.4758929014205933, + "reward_std": 0.1755753457546234, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4758928418159485, + "rewards/curriculum_aware_reward_fn/std": 0.4153573215007782, + "step": 1909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1045.0, + "completions/max_terminated_length": 1045.0, + "completions/mean_length": 532.3125, + "completions/mean_terminated_length": 532.3125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 1.970853752901728, + "grad_norm": 0.7325468063354492, + "kl": 0.10595703125, + "learning_rate": 1e-06, + "loss": -0.0275, + "num_tokens": 223338600.0, + "reward": 1.4348214864730835, + "reward_std": 0.2412104308605194, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43482139706611633, + "rewards/curriculum_aware_reward_fn/std": 0.4159456491470337, + "step": 1910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1092.0, + "completions/max_terminated_length": 1092.0, + "completions/mean_length": 416.1964416503906, + "completions/mean_terminated_length": 416.1964416503906, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 1.9718854784627289, + "grad_norm": 0.7689169645309448, + "kl": 0.12548828125, + "learning_rate": 1e-06, + "loss": -0.0099, + "num_tokens": 223443972.0, + "reward": 1.6049107313156128, + "reward_std": 0.2390371859073639, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6138392090797424, + "rewards/curriculum_aware_reward_fn/std": 0.40348759293556213, + "step": 1911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1235.0, + "completions/max_terminated_length": 1235.0, + "completions/mean_length": 496.7410888671875, + "completions/mean_terminated_length": 496.7410888671875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 1.9729172040237297, + "grad_norm": 0.7692927718162537, + "kl": 0.11083984375, + "learning_rate": 1e-06, + "loss": 0.0252, + "num_tokens": 223573730.0, + "reward": 1.349107265472412, + "reward_std": 0.23767122626304626, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34910711646080017, + "rewards/curriculum_aware_reward_fn/std": 0.4030839204788208, + "step": 1912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 970.0, + "completions/max_terminated_length": 970.0, + "completions/mean_length": 500.5982360839844, + "completions/mean_terminated_length": 500.5982360839844, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 1.9739489295847306, + "grad_norm": 0.8327507376670837, + "kl": 0.1146240234375, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 223691215.0, + "reward": 1.4968750476837158, + "reward_std": 0.24476194381713867, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.49687498807907104, + "rewards/curriculum_aware_reward_fn/std": 0.4174289107322693, + "step": 1913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1067.0, + "completions/max_terminated_length": 1067.0, + "completions/mean_length": 510.8660888671875, + "completions/mean_terminated_length": 510.8660888671875, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 1.9749806551457314, + "grad_norm": 0.761043131351471, + "kl": 0.114501953125, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 223823353.0, + "reward": 1.3392857313156128, + "reward_std": 0.18880169093608856, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3482142984867096, + "rewards/curriculum_aware_reward_fn/std": 0.4080788195133209, + "step": 1914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1128.0, + "completions/max_terminated_length": 1128.0, + "completions/mean_length": 475.3214416503906, + "completions/mean_terminated_length": 475.3214416503906, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 1.976012380706732, + "grad_norm": 0.7875248789787292, + "kl": 0.11572265625, + "learning_rate": 1e-06, + "loss": 0.0121, + "num_tokens": 223941633.0, + "reward": 1.4611608982086182, + "reward_std": 0.1961316615343094, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46116071939468384, + "rewards/curriculum_aware_reward_fn/std": 0.4118027985095978, + "step": 1915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 906.0, + "completions/max_terminated_length": 906.0, + "completions/mean_length": 507.9732360839844, + "completions/mean_terminated_length": 507.9732360839844, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 1.9770441062677326, + "grad_norm": 0.723215639591217, + "kl": 0.107421875, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 224073559.0, + "reward": 1.4205358028411865, + "reward_std": 0.1860450804233551, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42053571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.4325992166996002, + "step": 1916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 924.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 459.9464416503906, + "completions/mean_terminated_length": 459.9464416503906, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 1.9780758318287335, + "grad_norm": 0.8373486399650574, + "kl": 0.121826171875, + "learning_rate": 1e-06, + "loss": -0.0095, + "num_tokens": 224199749.0, + "reward": 1.4727680683135986, + "reward_std": 0.32642921805381775, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4727678596973419, + "rewards/curriculum_aware_reward_fn/std": 0.5004002451896667, + "step": 1917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 458.08038330078125, + "completions/mean_terminated_length": 458.08038330078125, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 1.9791075573897343, + "grad_norm": 0.7150551676750183, + "kl": 0.1224365234375, + "learning_rate": 1e-06, + "loss": 0.025, + "num_tokens": 224321537.0, + "reward": 1.579017996788025, + "reward_std": 0.1882287859916687, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5790178179740906, + "rewards/curriculum_aware_reward_fn/std": 0.3917297124862671, + "step": 1918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 902.0, + "completions/max_terminated_length": 902.0, + "completions/mean_length": 433.15179443359375, + "completions/mean_terminated_length": 433.15179443359375, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 1.9801392829507352, + "grad_norm": 0.6942814588546753, + "kl": 0.1180419921875, + "learning_rate": 1e-06, + "loss": 0.0246, + "num_tokens": 224430936.0, + "reward": 1.6250001192092896, + "reward_std": 0.15352478623390198, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.625, + "rewards/curriculum_aware_reward_fn/std": 0.4262765645980835, + "step": 1919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 899.0, + "completions/max_terminated_length": 899.0, + "completions/mean_length": 477.8839416503906, + "completions/mean_terminated_length": 477.8839416503906, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 1.981171008511736, + "grad_norm": 0.7555545568466187, + "kl": 0.1136474609375, + "learning_rate": 1e-06, + "loss": 0.0153, + "num_tokens": 224546801.0, + "reward": 1.4424108266830444, + "reward_std": 0.23390239477157593, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4513393044471741, + "rewards/curriculum_aware_reward_fn/std": 0.46204888820648193, + "step": 1920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 904.0, + "completions/max_terminated_length": 904.0, + "completions/mean_length": 485.95538330078125, + "completions/mean_terminated_length": 485.95538330078125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 1.9822027340727366, + "grad_norm": 0.8747866153717041, + "kl": 0.118408203125, + "learning_rate": 1e-06, + "loss": 0.0209, + "num_tokens": 224676212.0, + "reward": 1.4625002145767212, + "reward_std": 0.28620848059654236, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4625000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.39229631423950195, + "step": 1921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 447.0535888671875, + "completions/mean_terminated_length": 447.0535888671875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 1.9832344596337375, + "grad_norm": 0.6834079027175903, + "kl": 0.1190185546875, + "learning_rate": 1e-06, + "loss": 0.0074, + "num_tokens": 224787147.0, + "reward": 1.5656250715255737, + "reward_std": 0.21303489804267883, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.565625011920929, + "rewards/curriculum_aware_reward_fn/std": 0.4493253827095032, + "step": 1922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 999.0, + "completions/max_terminated_length": 999.0, + "completions/mean_length": 457.169677734375, + "completions/mean_terminated_length": 457.169677734375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 1.984266185194738, + "grad_norm": 0.6467311978340149, + "kl": 0.111572265625, + "learning_rate": 1e-06, + "loss": 0.0097, + "num_tokens": 224904654.0, + "reward": 1.4964287281036377, + "reward_std": 0.21329189836978912, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5053571462631226, + "rewards/curriculum_aware_reward_fn/std": 0.43719813227653503, + "step": 1923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1199.0, + "completions/max_terminated_length": 1199.0, + "completions/mean_length": 493.107177734375, + "completions/mean_terminated_length": 493.107177734375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 1.985297910755739, + "grad_norm": 0.7250528931617737, + "kl": 0.1156005859375, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 225029518.0, + "reward": 1.4723213911056519, + "reward_std": 0.19657452404499054, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47232145071029663, + "rewards/curriculum_aware_reward_fn/std": 0.41202855110168457, + "step": 1924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1174.0, + "completions/max_terminated_length": 1174.0, + "completions/mean_length": 473.6607360839844, + "completions/mean_terminated_length": 473.6607360839844, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 1.9863296363167398, + "grad_norm": 0.6025116443634033, + "kl": 0.103271484375, + "learning_rate": 1e-06, + "loss": -0.0156, + "num_tokens": 225151364.0, + "reward": 1.463392972946167, + "reward_std": 0.17400220036506653, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46339288353919983, + "rewards/curriculum_aware_reward_fn/std": 0.458060085773468, + "step": 1925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 926.0, + "completions/max_terminated_length": 926.0, + "completions/mean_length": 484.9107360839844, + "completions/mean_terminated_length": 484.9107360839844, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 1.9873613618777406, + "grad_norm": 0.7289581298828125, + "kl": 0.116943359375, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 225285775.0, + "reward": 1.4758929014205933, + "reward_std": 0.2225145697593689, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4758928120136261, + "rewards/curriculum_aware_reward_fn/std": 0.43376532196998596, + "step": 1926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1150.0, + "completions/max_terminated_length": 1150.0, + "completions/mean_length": 475.2589416503906, + "completions/mean_terminated_length": 475.2589416503906, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 1.9883930874387414, + "grad_norm": 0.7278993129730225, + "kl": 0.1214599609375, + "learning_rate": 1e-06, + "loss": -0.023, + "num_tokens": 225401234.0, + "reward": 1.3933037519454956, + "reward_std": 0.13688452541828156, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3933035731315613, + "rewards/curriculum_aware_reward_fn/std": 0.3881984055042267, + "step": 1927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1149.0, + "completions/max_terminated_length": 1149.0, + "completions/mean_length": 492.3660888671875, + "completions/mean_terminated_length": 492.3660888671875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 1.989424812999742, + "grad_norm": 0.6222229599952698, + "kl": 0.1170654296875, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 225519750.0, + "reward": 1.5058035850524902, + "reward_std": 0.10875215381383896, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5058035850524902, + "rewards/curriculum_aware_reward_fn/std": 0.4427974820137024, + "step": 1928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 475.7410888671875, + "completions/mean_terminated_length": 475.7410888671875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 1.990456538560743, + "grad_norm": 0.6316165328025818, + "kl": 0.114990234375, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 225633576.0, + "reward": 1.466071605682373, + "reward_std": 0.14979317784309387, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4660714268684387, + "rewards/curriculum_aware_reward_fn/std": 0.44429388642311096, + "step": 1929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 471.9732360839844, + "completions/mean_terminated_length": 471.9732360839844, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 1.9914882641217435, + "grad_norm": 0.9416745901107788, + "kl": 0.1495361328125, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 225761036.0, + "reward": 1.4834821224212646, + "reward_std": 0.23354566097259521, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48348215222358704, + "rewards/curriculum_aware_reward_fn/std": 0.41413038969039917, + "step": 1930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 978.0, + "completions/max_terminated_length": 978.0, + "completions/mean_length": 510.3839416503906, + "completions/mean_terminated_length": 510.3839416503906, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 1.9925199896827444, + "grad_norm": 0.7914566993713379, + "kl": 0.1246337890625, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 225880345.0, + "reward": 1.4580358266830444, + "reward_std": 0.2331819087266922, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4669643044471741, + "rewards/curriculum_aware_reward_fn/std": 0.4202934205532074, + "step": 1931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 960.0, + "completions/max_terminated_length": 960.0, + "completions/mean_length": 454.5714416503906, + "completions/mean_terminated_length": 454.5714416503906, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 1.9935517152437452, + "grad_norm": 0.7780898213386536, + "kl": 0.114990234375, + "learning_rate": 1e-06, + "loss": 0.0304, + "num_tokens": 225991891.0, + "reward": 1.440178632736206, + "reward_std": 0.21832521259784698, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4401785433292389, + "rewards/curriculum_aware_reward_fn/std": 0.43463972210884094, + "step": 1932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1073.0, + "completions/max_terminated_length": 1073.0, + "completions/mean_length": 484.6160888671875, + "completions/mean_terminated_length": 484.6160888671875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 1.994583440804746, + "grad_norm": 0.7472946047782898, + "kl": 0.1243896484375, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 226113494.0, + "reward": 1.493303656578064, + "reward_std": 0.17449244856834412, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4933035671710968, + "rewards/curriculum_aware_reward_fn/std": 0.44089874625205994, + "step": 1933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 435.7321472167969, + "completions/mean_terminated_length": 435.7321472167969, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 1.9956151663657469, + "grad_norm": 0.8039238452911377, + "kl": 0.12939453125, + "learning_rate": 1e-06, + "loss": 0.0183, + "num_tokens": 226225940.0, + "reward": 1.471428632736206, + "reward_std": 0.296146035194397, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4714285731315613, + "rewards/curriculum_aware_reward_fn/std": 0.4306769371032715, + "step": 1934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1407.0, + "completions/max_terminated_length": 1407.0, + "completions/mean_length": 511.5535888671875, + "completions/mean_terminated_length": 511.5535888671875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 1.9966468919267475, + "grad_norm": 0.6034707427024841, + "kl": 0.1014404296875, + "learning_rate": 1e-06, + "loss": -0.0195, + "num_tokens": 226352324.0, + "reward": 1.3870537281036377, + "reward_std": 0.13577203452587128, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38705354928970337, + "rewards/curriculum_aware_reward_fn/std": 0.44091150164604187, + "step": 1935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 999.0, + "completions/max_terminated_length": 999.0, + "completions/mean_length": 481.0357360839844, + "completions/mean_terminated_length": 481.0357360839844, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 1.997678617487748, + "grad_norm": 0.6917321085929871, + "kl": 0.117919921875, + "learning_rate": 1e-06, + "loss": 0.015, + "num_tokens": 226477145.0, + "reward": 1.6187500953674316, + "reward_std": 0.21138519048690796, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6187500357627869, + "rewards/curriculum_aware_reward_fn/std": 0.40886160731315613, + "step": 1936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1279.0, + "completions/max_terminated_length": 1279.0, + "completions/mean_length": 517.0892944335938, + "completions/mean_terminated_length": 517.0892944335938, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 1.998710343048749, + "grad_norm": 0.6973819732666016, + "kl": 0.12451171875, + "learning_rate": 1e-06, + "loss": 0.0221, + "num_tokens": 226607495.0, + "reward": 1.3973214626312256, + "reward_std": 0.19465167820453644, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3973214328289032, + "rewards/curriculum_aware_reward_fn/std": 0.43287426233291626, + "step": 1937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1028.0, + "completions/max_terminated_length": 1028.0, + "completions/mean_length": 586.7000122070312, + "completions/mean_terminated_length": 586.7000122070312, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 1.9997420686097498, + "grad_norm": 0.7009508609771729, + "kl": 0.1156005859375, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 226742568.0, + "reward": 1.4357143640518188, + "reward_std": 0.2279166579246521, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4357142746448517, + "rewards/curriculum_aware_reward_fn/std": 0.42274460196495056, + "step": 1938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1184.0, + "completions/max_terminated_length": 1184.0, + "completions/mean_length": 512.6428833007812, + "completions/mean_terminated_length": 512.6428833007812, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 2.001031725561001, + "grad_norm": 0.6595158576965332, + "kl": 0.1156005859375, + "learning_rate": 1e-06, + "loss": -0.0156, + "num_tokens": 226869480.0, + "reward": 1.5250000953674316, + "reward_std": 0.2136206030845642, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5249999761581421, + "rewards/curriculum_aware_reward_fn/std": 0.4552001953125, + "step": 1939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1072.0, + "completions/max_terminated_length": 1072.0, + "completions/mean_length": 501.294677734375, + "completions/mean_terminated_length": 501.294677734375, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 2.0020634511220017, + "grad_norm": 0.7600402235984802, + "kl": 0.117919921875, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 226996360.0, + "reward": 1.4607144594192505, + "reward_std": 0.19837914407253265, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46071428060531616, + "rewards/curriculum_aware_reward_fn/std": 0.4279339015483856, + "step": 1940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1133.0, + "completions/max_terminated_length": 1133.0, + "completions/mean_length": 491.2500305175781, + "completions/mean_terminated_length": 491.2500305175781, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 2.0030951766830025, + "grad_norm": 0.6804510354995728, + "kl": 0.1165771484375, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 227117908.0, + "reward": 1.6517857313156128, + "reward_std": 0.15756270289421082, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6517857313156128, + "rewards/curriculum_aware_reward_fn/std": 0.4126342833042145, + "step": 1941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1030.0, + "completions/max_terminated_length": 1030.0, + "completions/mean_length": 457.83929443359375, + "completions/mean_terminated_length": 457.83929443359375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 2.004126902244003, + "grad_norm": 0.7952170372009277, + "kl": 0.1226806640625, + "learning_rate": 1e-06, + "loss": 0.0389, + "num_tokens": 227227538.0, + "reward": 1.5852677822113037, + "reward_std": 0.1368647962808609, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5852679014205933, + "rewards/curriculum_aware_reward_fn/std": 0.430044949054718, + "step": 1942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1116.0, + "completions/max_terminated_length": 1116.0, + "completions/mean_length": 480.794677734375, + "completions/mean_terminated_length": 480.794677734375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 2.0051586278050038, + "grad_norm": 0.7373249530792236, + "kl": 0.12255859375, + "learning_rate": 1e-06, + "loss": -0.0114, + "num_tokens": 227346823.0, + "reward": 1.4571430683135986, + "reward_std": 0.2077142298221588, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4660714268684387, + "rewards/curriculum_aware_reward_fn/std": 0.4125407040119171, + "step": 1943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 512.5267944335938, + "completions/mean_terminated_length": 512.5267944335938, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 2.0061903533660046, + "grad_norm": 0.7481683492660522, + "kl": 0.11279296875, + "learning_rate": 1e-06, + "loss": 0.029, + "num_tokens": 227473050.0, + "reward": 1.4656251668930054, + "reward_std": 0.2195693999528885, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46562501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.4202118217945099, + "step": 1944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 983.0, + "completions/max_terminated_length": 983.0, + "completions/mean_length": 476.3750305175781, + "completions/mean_terminated_length": 476.3750305175781, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 2.0072220789270054, + "grad_norm": 0.7075243592262268, + "kl": 0.110107421875, + "learning_rate": 1e-06, + "loss": 0.0045, + "num_tokens": 227591231.0, + "reward": 1.5468751192092896, + "reward_std": 0.22008267045021057, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5468749403953552, + "rewards/curriculum_aware_reward_fn/std": 0.41515660285949707, + "step": 1945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1507.0, + "completions/max_terminated_length": 1507.0, + "completions/mean_length": 458.6339416503906, + "completions/mean_terminated_length": 458.6339416503906, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 2.0082538044880063, + "grad_norm": 0.6288896799087524, + "kl": 0.1248779296875, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 227709882.0, + "reward": 1.6352678537368774, + "reward_std": 0.11739077419042587, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6352678537368774, + "rewards/curriculum_aware_reward_fn/std": 0.42983540892601013, + "step": 1946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3775.0, + "completions/max_terminated_length": 3775.0, + "completions/mean_length": 556.3839721679688, + "completions/mean_terminated_length": 556.3839721679688, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 2.009285530049007, + "grad_norm": 0.6854794025421143, + "kl": 0.10791015625, + "learning_rate": 1e-06, + "loss": 0.0265, + "num_tokens": 227840167.0, + "reward": 1.472321629524231, + "reward_std": 0.22720672190189362, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47232145071029663, + "rewards/curriculum_aware_reward_fn/std": 0.40945136547088623, + "step": 1947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 970.0, + "completions/max_terminated_length": 970.0, + "completions/mean_length": 480.4732360839844, + "completions/mean_terminated_length": 480.4732360839844, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 2.010317255610008, + "grad_norm": 0.7467259764671326, + "kl": 0.1180419921875, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 227969972.0, + "reward": 1.458482265472412, + "reward_std": 0.2527380883693695, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45848211646080017, + "rewards/curriculum_aware_reward_fn/std": 0.4306337237358093, + "step": 1948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1184.0, + "completions/max_terminated_length": 1184.0, + "completions/mean_length": 477.83929443359375, + "completions/mean_terminated_length": 477.83929443359375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 2.0113489811710084, + "grad_norm": 0.7228710651397705, + "kl": 0.10400390625, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 228084089.0, + "reward": 1.5075894594192505, + "reward_std": 0.15247538685798645, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5075892806053162, + "rewards/curriculum_aware_reward_fn/std": 0.4195777475833893, + "step": 1949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1355.0, + "completions/max_terminated_length": 1355.0, + "completions/mean_length": 481.0625305175781, + "completions/mean_terminated_length": 481.0625305175781, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 2.012380706732009, + "grad_norm": 0.8138713836669922, + "kl": 0.1185302734375, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 228207065.0, + "reward": 1.489285945892334, + "reward_std": 0.19530774652957916, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4892857074737549, + "rewards/curriculum_aware_reward_fn/std": 0.39298057556152344, + "step": 1950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 992.0, + "completions/max_terminated_length": 992.0, + "completions/mean_length": 481.5089416503906, + "completions/mean_terminated_length": 481.5089416503906, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 2.01341243229301, + "grad_norm": 0.7092810273170471, + "kl": 0.1160888671875, + "learning_rate": 1e-06, + "loss": 0.0208, + "num_tokens": 228322559.0, + "reward": 1.519196629524231, + "reward_std": 0.20195798575878143, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.528124988079071, + "rewards/curriculum_aware_reward_fn/std": 0.45046430826187134, + "step": 1951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 820.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 433.2589416503906, + "completions/mean_terminated_length": 433.2589416503906, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 2.014444157854011, + "grad_norm": 0.679499626159668, + "kl": 0.130859375, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 228432418.0, + "reward": 1.6607143878936768, + "reward_std": 0.16779665648937225, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6607142686843872, + "rewards/curriculum_aware_reward_fn/std": 0.41298893094062805, + "step": 1952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1436.0, + "completions/max_terminated_length": 1436.0, + "completions/mean_length": 497.0982360839844, + "completions/mean_terminated_length": 497.0982360839844, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 2.0154758834150117, + "grad_norm": 0.7596502304077148, + "kl": 0.13232421875, + "learning_rate": 1e-06, + "loss": 0.0314, + "num_tokens": 228550869.0, + "reward": 1.43348228931427, + "reward_std": 0.14467096328735352, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43348217010498047, + "rewards/curriculum_aware_reward_fn/std": 0.5326877236366272, + "step": 1953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1593.0, + "completions/max_terminated_length": 1593.0, + "completions/mean_length": 538.9910888671875, + "completions/mean_terminated_length": 538.9910888671875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 2.0165076089760126, + "grad_norm": 0.6555126905441284, + "kl": 0.1097412109375, + "learning_rate": 1e-06, + "loss": 0.0132, + "num_tokens": 228693178.0, + "reward": 1.4982144832611084, + "reward_std": 0.17982201278209686, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4982143044471741, + "rewards/curriculum_aware_reward_fn/std": 0.5427212119102478, + "step": 1954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 456.76788330078125, + "completions/mean_terminated_length": 456.76788330078125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 2.017539334537013, + "grad_norm": 0.8210303783416748, + "kl": 0.118408203125, + "learning_rate": 1e-06, + "loss": -0.0081, + "num_tokens": 228805950.0, + "reward": 1.419196605682373, + "reward_std": 0.19702807068824768, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41919639706611633, + "rewards/curriculum_aware_reward_fn/std": 0.3827499747276306, + "step": 1955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 971.0, + "completions/max_terminated_length": 971.0, + "completions/mean_length": 457.7410888671875, + "completions/mean_terminated_length": 457.7410888671875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 2.018571060098014, + "grad_norm": 0.6348860859870911, + "kl": 0.1182861328125, + "learning_rate": 1e-06, + "loss": -0.0058, + "num_tokens": 228925901.0, + "reward": 1.4950894117355347, + "reward_std": 0.1397673785686493, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4950893223285675, + "rewards/curriculum_aware_reward_fn/std": 0.4401044249534607, + "step": 1956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 967.0, + "completions/max_terminated_length": 967.0, + "completions/mean_length": 469.6160888671875, + "completions/mean_terminated_length": 469.6160888671875, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 2.0196027856590146, + "grad_norm": 0.6954073905944824, + "kl": 0.1123046875, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 229040315.0, + "reward": 1.3888394832611084, + "reward_std": 0.15426510572433472, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3888392746448517, + "rewards/curriculum_aware_reward_fn/std": 0.39879944920539856, + "step": 1957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1366.0, + "completions/max_terminated_length": 1366.0, + "completions/mean_length": 514.9375, + "completions/mean_terminated_length": 514.9375, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 2.0206345112200155, + "grad_norm": 0.7137829065322876, + "kl": 0.11083984375, + "learning_rate": 1e-06, + "loss": 0.0218, + "num_tokens": 229172391.0, + "reward": 1.4209822416305542, + "reward_std": 0.2412976324558258, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42098215222358704, + "rewards/curriculum_aware_reward_fn/std": 0.4099350869655609, + "step": 1958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 951.0, + "completions/max_terminated_length": 951.0, + "completions/mean_length": 466.9285888671875, + "completions/mean_terminated_length": 466.9285888671875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 2.0216662367810163, + "grad_norm": 0.6483295559883118, + "kl": 0.1134033203125, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 229287953.0, + "reward": 1.516964316368103, + "reward_std": 0.17437413334846497, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.516964316368103, + "rewards/curriculum_aware_reward_fn/std": 0.42762458324432373, + "step": 1959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 383.08038330078125, + "completions/mean_terminated_length": 383.08038330078125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 2.022697962342017, + "grad_norm": 0.8443179726600647, + "kl": 0.1192626953125, + "learning_rate": 1e-06, + "loss": 0.0185, + "num_tokens": 229382812.0, + "reward": 1.5861607789993286, + "reward_std": 0.18597619235515594, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5861607193946838, + "rewards/curriculum_aware_reward_fn/std": 0.43434789776802063, + "step": 1960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1641.0, + "completions/max_terminated_length": 1641.0, + "completions/mean_length": 518.375, + "completions/mean_terminated_length": 518.375, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 2.023729687903018, + "grad_norm": 0.6534212827682495, + "kl": 0.0992431640625, + "learning_rate": 1e-06, + "loss": -0.0172, + "num_tokens": 229510891.0, + "reward": 1.4928573369979858, + "reward_std": 0.1748374104499817, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4928571581840515, + "rewards/curriculum_aware_reward_fn/std": 0.45231422781944275, + "step": 1961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1637.0, + "completions/max_terminated_length": 1637.0, + "completions/mean_length": 496.2232360839844, + "completions/mean_terminated_length": 496.2232360839844, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 2.0247614134640184, + "grad_norm": 0.6812605857849121, + "kl": 0.109619140625, + "learning_rate": 1e-06, + "loss": -0.0112, + "num_tokens": 229633220.0, + "reward": 1.4276787042617798, + "reward_std": 0.17530037462711334, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42767858505249023, + "rewards/curriculum_aware_reward_fn/std": 0.39585912227630615, + "step": 1962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1231.0, + "completions/max_terminated_length": 1231.0, + "completions/mean_length": 505.1785888671875, + "completions/mean_terminated_length": 505.1785888671875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 2.0257931390250192, + "grad_norm": 0.6694918870925903, + "kl": 0.1153564453125, + "learning_rate": 1e-06, + "loss": -0.012, + "num_tokens": 229768067.0, + "reward": 1.3674107789993286, + "reward_std": 0.17926202714443207, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36741071939468384, + "rewards/curriculum_aware_reward_fn/std": 0.41163864731788635, + "step": 1963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 391.4464416503906, + "completions/mean_terminated_length": 391.4464416503906, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 2.02682486458602, + "grad_norm": 0.8477822542190552, + "kl": 0.1187744140625, + "learning_rate": 1e-06, + "loss": -0.0256, + "num_tokens": 229870009.0, + "reward": 1.5107144117355347, + "reward_std": 0.12055130302906036, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5107142925262451, + "rewards/curriculum_aware_reward_fn/std": 0.4163563549518585, + "step": 1964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1400.0, + "completions/max_terminated_length": 1400.0, + "completions/mean_length": 458.51788330078125, + "completions/mean_terminated_length": 458.51788330078125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 2.027856590147021, + "grad_norm": 0.7395011782646179, + "kl": 0.1177978515625, + "learning_rate": 1e-06, + "loss": 0.0153, + "num_tokens": 229999322.0, + "reward": 1.4848215579986572, + "reward_std": 0.254284143447876, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4848214089870453, + "rewards/curriculum_aware_reward_fn/std": 0.44599515199661255, + "step": 1965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3908.0, + "completions/max_terminated_length": 3908.0, + "completions/mean_length": 487.5357360839844, + "completions/mean_terminated_length": 487.5357360839844, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 2.0288883157080218, + "grad_norm": 0.7274074554443359, + "kl": 0.1099853515625, + "learning_rate": 1e-06, + "loss": -0.0304, + "num_tokens": 230124360.0, + "reward": 1.4107143878936768, + "reward_std": 0.19167140126228333, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4107142984867096, + "rewards/curriculum_aware_reward_fn/std": 0.3991222679615021, + "step": 1966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 455.857177734375, + "completions/mean_terminated_length": 455.857177734375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 2.0299200412690226, + "grad_norm": 0.776483416557312, + "kl": 0.11767578125, + "learning_rate": 1e-06, + "loss": 0.0125, + "num_tokens": 230240411.0, + "reward": 1.6031250953674316, + "reward_std": 0.21919363737106323, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6031250357627869, + "rewards/curriculum_aware_reward_fn/std": 0.41423332691192627, + "step": 1967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 729.0, + "completions/max_terminated_length": 729.0, + "completions/mean_length": 410.95538330078125, + "completions/mean_terminated_length": 410.95538330078125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 2.0309517668300234, + "grad_norm": 0.8736270666122437, + "kl": 0.1171875, + "learning_rate": 1e-06, + "loss": -0.0058, + "num_tokens": 230345553.0, + "reward": 1.5915179252624512, + "reward_std": 0.28784704208374023, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5915178656578064, + "rewards/curriculum_aware_reward_fn/std": 0.42116785049438477, + "step": 1968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 854.0, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 472.46429443359375, + "completions/mean_terminated_length": 472.46429443359375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 2.031983492391024, + "grad_norm": 0.7397417426109314, + "kl": 0.120849609375, + "learning_rate": 1e-06, + "loss": -0.0099, + "num_tokens": 230474776.0, + "reward": 1.4187500476837158, + "reward_std": 0.18822595477104187, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41875001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.39763516187667847, + "step": 1969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 966.0, + "completions/max_terminated_length": 966.0, + "completions/mean_length": 442.02679443359375, + "completions/mean_terminated_length": 442.02679443359375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 2.0330152179520247, + "grad_norm": 0.6692867279052734, + "kl": 0.1123046875, + "learning_rate": 1e-06, + "loss": 0.0179, + "num_tokens": 230596495.0, + "reward": 1.430803656578064, + "reward_std": 0.19043758511543274, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4308035671710968, + "rewards/curriculum_aware_reward_fn/std": 0.46688762307167053, + "step": 1970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 908.0, + "completions/mean_length": 447.08038330078125, + "completions/mean_terminated_length": 414.20721435546875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 2.0340469435130255, + "grad_norm": 0.8106693029403687, + "kl": 0.11962890625, + "learning_rate": 1e-06, + "loss": 0.0231, + "num_tokens": 230709484.0, + "reward": 1.614732265472412, + "reward_std": 0.18752601742744446, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6147322058677673, + "rewards/curriculum_aware_reward_fn/std": 0.430044949054718, + "step": 1971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 402.95538330078125, + "completions/mean_terminated_length": 402.95538330078125, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 2.0350786690740263, + "grad_norm": 0.7740663886070251, + "kl": 0.1324462890625, + "learning_rate": 1e-06, + "loss": 0.0214, + "num_tokens": 230830439.0, + "reward": 1.4558035135269165, + "reward_std": 0.2036939114332199, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45580360293388367, + "rewards/curriculum_aware_reward_fn/std": 0.4446248412132263, + "step": 1972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 778.0, + "completions/max_terminated_length": 778.0, + "completions/mean_length": 399.6964416503906, + "completions/mean_terminated_length": 399.6964416503906, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 2.036110394635027, + "grad_norm": 0.6959367990493774, + "kl": 0.111572265625, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 230939528.0, + "reward": 1.5232144594192505, + "reward_std": 0.14704690873622894, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5232142806053162, + "rewards/curriculum_aware_reward_fn/std": 0.4357311427593231, + "step": 1973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1014.0, + "completions/max_terminated_length": 1014.0, + "completions/mean_length": 429.8214416503906, + "completions/mean_terminated_length": 429.8214416503906, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 2.037142120196028, + "grad_norm": 0.8482027053833008, + "kl": 0.128173828125, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 231052964.0, + "reward": 1.505357265472412, + "reward_std": 0.23154647648334503, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5053571462631226, + "rewards/curriculum_aware_reward_fn/std": 0.4438953697681427, + "step": 1974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1496.0, + "completions/max_terminated_length": 1496.0, + "completions/mean_length": 463.5625305175781, + "completions/mean_terminated_length": 463.5625305175781, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 2.0381738457570284, + "grad_norm": 0.7355022430419922, + "kl": 0.110595703125, + "learning_rate": 1e-06, + "loss": -0.0151, + "num_tokens": 231172748.0, + "reward": 1.473660945892334, + "reward_std": 0.19264405965805054, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4736607074737549, + "rewards/curriculum_aware_reward_fn/std": 0.44117775559425354, + "step": 1975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1138.0, + "completions/max_terminated_length": 1138.0, + "completions/mean_length": 461.232177734375, + "completions/mean_terminated_length": 461.232177734375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 2.0392055713180293, + "grad_norm": 0.7399516701698303, + "kl": 0.121337890625, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 231292131.0, + "reward": 1.500892996788025, + "reward_std": 0.1874261349439621, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5008928179740906, + "rewards/curriculum_aware_reward_fn/std": 0.4057571291923523, + "step": 1976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 430.14288330078125, + "completions/mean_terminated_length": 430.14288330078125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 2.04023729687903, + "grad_norm": 0.7026528716087341, + "kl": 0.116455078125, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 231408900.0, + "reward": 1.4950892925262451, + "reward_std": 0.23565950989723206, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5040178894996643, + "rewards/curriculum_aware_reward_fn/std": 0.417475163936615, + "step": 1977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 386.2946472167969, + "completions/mean_terminated_length": 386.2946472167969, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 2.041269022440031, + "grad_norm": 0.6979001760482788, + "kl": 0.1278076171875, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 231505288.0, + "reward": 1.579017996788025, + "reward_std": 0.15396980941295624, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5790178179740906, + "rewards/curriculum_aware_reward_fn/std": 0.4313000738620758, + "step": 1978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 418.6607360839844, + "completions/mean_terminated_length": 418.6607360839844, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 2.042300748001032, + "grad_norm": 0.623936653137207, + "kl": 0.106201171875, + "learning_rate": 1e-06, + "loss": -0.0035, + "num_tokens": 231611468.0, + "reward": 1.4375001192092896, + "reward_std": 0.10704384744167328, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4375, + "rewards/curriculum_aware_reward_fn/std": 0.4572489261627197, + "step": 1979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 438.3660888671875, + "completions/mean_terminated_length": 438.3660888671875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 2.0433324735620326, + "grad_norm": 0.8500005006790161, + "kl": 0.1148681640625, + "learning_rate": 1e-06, + "loss": 0.0131, + "num_tokens": 231733932.0, + "reward": 1.4482144117355347, + "reward_std": 0.20263758301734924, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4482142925262451, + "rewards/curriculum_aware_reward_fn/std": 0.4057541489601135, + "step": 1980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 941.0, + "completions/max_terminated_length": 941.0, + "completions/mean_length": 435.58929443359375, + "completions/mean_terminated_length": 435.58929443359375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 2.0443641991230335, + "grad_norm": 0.8085100054740906, + "kl": 0.11767578125, + "learning_rate": 1e-06, + "loss": -0.0027, + "num_tokens": 231847104.0, + "reward": 1.3821427822113037, + "reward_std": 0.1722569614648819, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3821428716182709, + "rewards/curriculum_aware_reward_fn/std": 0.40719079971313477, + "step": 1981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 988.0, + "completions/max_terminated_length": 988.0, + "completions/mean_length": 440.0625305175781, + "completions/mean_terminated_length": 440.0625305175781, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 2.045395924684034, + "grad_norm": 0.8003664612770081, + "kl": 0.1328125, + "learning_rate": 1e-06, + "loss": -0.0094, + "num_tokens": 231971060.0, + "reward": 1.4665179252624512, + "reward_std": 0.2330600619316101, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4665178656578064, + "rewards/curriculum_aware_reward_fn/std": 0.43689054250717163, + "step": 1982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1208.0, + "completions/max_terminated_length": 1208.0, + "completions/mean_length": 459.77679443359375, + "completions/mean_terminated_length": 459.77679443359375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 2.0464276502450347, + "grad_norm": 0.877363383769989, + "kl": 0.127685546875, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 232103818.0, + "reward": 1.516517996788025, + "reward_std": 0.22694124281406403, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5165178179740906, + "rewards/curriculum_aware_reward_fn/std": 0.3971967399120331, + "step": 1983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1097.0, + "completions/max_terminated_length": 1097.0, + "completions/mean_length": 454.58038330078125, + "completions/mean_terminated_length": 454.58038330078125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 2.0474593758060355, + "grad_norm": 0.721016526222229, + "kl": 0.1104736328125, + "learning_rate": 1e-06, + "loss": 0.008, + "num_tokens": 232211627.0, + "reward": 1.4062501192092896, + "reward_std": 0.17761258780956268, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4151785671710968, + "rewards/curriculum_aware_reward_fn/std": 0.4367591440677643, + "step": 1984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1186.0, + "completions/max_terminated_length": 1186.0, + "completions/mean_length": 427.1250305175781, + "completions/mean_terminated_length": 427.1250305175781, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 2.0484911013670364, + "grad_norm": 0.8724990487098694, + "kl": 0.125244140625, + "learning_rate": 1e-06, + "loss": 0.0197, + "num_tokens": 232318846.0, + "reward": 1.532589316368103, + "reward_std": 0.19645501673221588, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.532589316368103, + "rewards/curriculum_aware_reward_fn/std": 0.39749833941459656, + "step": 1985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 992.0, + "completions/max_terminated_length": 992.0, + "completions/mean_length": 461.2589416503906, + "completions/mean_terminated_length": 461.2589416503906, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 2.0495228269280372, + "grad_norm": 0.7752722501754761, + "kl": 0.1173095703125, + "learning_rate": 1e-06, + "loss": -0.0036, + "num_tokens": 232438113.0, + "reward": 1.3625000715255737, + "reward_std": 0.2078167051076889, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.37142857909202576, + "rewards/curriculum_aware_reward_fn/std": 0.42483246326446533, + "step": 1986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 925.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 445.4464416503906, + "completions/mean_terminated_length": 445.4464416503906, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 2.050554552489038, + "grad_norm": 0.7875111103057861, + "kl": 0.125244140625, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 232558479.0, + "reward": 1.469642996788025, + "reward_std": 0.20513944327831268, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46964284777641296, + "rewards/curriculum_aware_reward_fn/std": 0.500376284122467, + "step": 1987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1101.0, + "completions/max_terminated_length": 1101.0, + "completions/mean_length": 458.3482360839844, + "completions/mean_terminated_length": 458.3482360839844, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 2.051586278050039, + "grad_norm": 0.9260916113853455, + "kl": 0.1683349609375, + "learning_rate": 1e-06, + "loss": 0.0123, + "num_tokens": 232678743.0, + "reward": 1.3781250715255737, + "reward_std": 0.2220853716135025, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.38705354928970337, + "rewards/curriculum_aware_reward_fn/std": 0.4021756947040558, + "step": 1988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 469.01788330078125, + "completions/mean_terminated_length": 469.01788330078125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 2.0526180036110393, + "grad_norm": 0.8016949892044067, + "kl": 0.11279296875, + "learning_rate": 1e-06, + "loss": -0.0059, + "num_tokens": 232797059.0, + "reward": 1.4160715341567993, + "reward_std": 0.24108101427555084, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41607141494750977, + "rewards/curriculum_aware_reward_fn/std": 0.40842556953430176, + "step": 1989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 938.0, + "completions/max_terminated_length": 938.0, + "completions/mean_length": 449.46429443359375, + "completions/mean_terminated_length": 449.46429443359375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 2.05364972917204, + "grad_norm": 0.7085789442062378, + "kl": 0.135009765625, + "learning_rate": 1e-06, + "loss": 0.0379, + "num_tokens": 232920269.0, + "reward": 1.4468750953674316, + "reward_std": 0.18384109437465668, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4558035731315613, + "rewards/curriculum_aware_reward_fn/std": 0.4252050817012787, + "step": 1990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 872.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 414.51788330078125, + "completions/mean_terminated_length": 414.51788330078125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 2.054681454733041, + "grad_norm": 0.7596367597579956, + "kl": 0.1248779296875, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 233034840.0, + "reward": 1.6830357313156128, + "reward_std": 0.19633789360523224, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6830357313156128, + "rewards/curriculum_aware_reward_fn/std": 0.3981688916683197, + "step": 1991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 434.7321472167969, + "completions/mean_terminated_length": 434.7321472167969, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 2.055713180294042, + "grad_norm": 0.7070798277854919, + "kl": 0.128173828125, + "learning_rate": 1e-06, + "loss": -0.048, + "num_tokens": 233153936.0, + "reward": 1.474107265472412, + "reward_std": 0.1844610571861267, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.48303571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.44309282302856445, + "step": 1992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1014.0, + "completions/max_terminated_length": 1014.0, + "completions/mean_length": 471.20538330078125, + "completions/mean_terminated_length": 471.20538330078125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 2.0567449058550427, + "grad_norm": 0.7285985946655273, + "kl": 0.1134033203125, + "learning_rate": 1e-06, + "loss": 0.0097, + "num_tokens": 233285020.0, + "reward": 1.352678656578064, + "reward_std": 0.2566676139831543, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.3705357015132904, + "rewards/curriculum_aware_reward_fn/std": 0.3969224691390991, + "step": 1993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1145.0, + "completions/max_terminated_length": 1145.0, + "completions/mean_length": 456.9285888671875, + "completions/mean_terminated_length": 456.9285888671875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 2.0577766314160435, + "grad_norm": 0.6904830932617188, + "kl": 0.12255859375, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 233403886.0, + "reward": 1.4937502145767212, + "reward_std": 0.22515329718589783, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.5205357670783997, + "rewards/curriculum_aware_reward_fn/std": 0.43436571955680847, + "step": 1994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 885.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 435.01788330078125, + "completions/mean_terminated_length": 435.01788330078125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 2.058808356977044, + "grad_norm": 0.8098060488700867, + "kl": 0.121337890625, + "learning_rate": 1e-06, + "loss": -0.0046, + "num_tokens": 233517442.0, + "reward": 1.4343750476837158, + "reward_std": 0.31481003761291504, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.46116071939468384, + "rewards/curriculum_aware_reward_fn/std": 0.44442036747932434, + "step": 1995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 471.8482360839844, + "completions/mean_terminated_length": 439.1982116699219, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 2.0598400825380447, + "grad_norm": 0.8379706740379333, + "kl": 0.1239013671875, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 233646750.0, + "reward": 1.5308037996292114, + "reward_std": 0.30953019857406616, + "rewards/code_format_reward/mean": 0.9375, + "rewards/code_format_reward/std": 0.24314938485622406, + "rewards/curriculum_aware_reward_fn/mean": 0.5933036208152771, + "rewards/curriculum_aware_reward_fn/std": 0.4207723140716553, + "step": 1996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1083.0, + "completions/max_terminated_length": 1083.0, + "completions/mean_length": 415.5446472167969, + "completions/mean_terminated_length": 415.5446472167969, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 2.0608718080990456, + "grad_norm": 0.7319987416267395, + "kl": 0.1361083984375, + "learning_rate": 1e-06, + "loss": 0.0073, + "num_tokens": 233756263.0, + "reward": 1.6250001192092896, + "reward_std": 0.22156484425067902, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.6428571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.4175139367580414, + "step": 1997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 825.0, + "completions/max_terminated_length": 825.0, + "completions/mean_length": 432.5089416503906, + "completions/mean_terminated_length": 432.5089416503906, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 2.0619035336600464, + "grad_norm": 0.8287761807441711, + "kl": 0.12109375, + "learning_rate": 1e-06, + "loss": -0.0228, + "num_tokens": 233869881.0, + "reward": 1.5004465579986572, + "reward_std": 0.25090718269348145, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5004464387893677, + "rewards/curriculum_aware_reward_fn/std": 0.433012455701828, + "step": 1998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 394.9910888671875, + "completions/mean_terminated_length": 394.9910888671875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 2.0629352592210473, + "grad_norm": 0.6038341522216797, + "kl": 0.13330078125, + "learning_rate": 1e-06, + "loss": -0.0148, + "num_tokens": 233975223.0, + "reward": 1.5892857313156128, + "reward_std": 0.1025947853922844, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5892857313156128, + "rewards/curriculum_aware_reward_fn/std": 0.43709877133369446, + "step": 1999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 432.83038330078125, + "completions/mean_terminated_length": 432.83038330078125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 2.063966984782048, + "grad_norm": 0.7323113083839417, + "kl": 0.132568359375, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 234087932.0, + "reward": 1.4352679252624512, + "reward_std": 0.2148611694574356, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4441964328289032, + "rewards/curriculum_aware_reward_fn/std": 0.41675201058387756, + "step": 2000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 878.0, + "completions/max_terminated_length": 878.0, + "completions/mean_length": 478.8750305175781, + "completions/mean_terminated_length": 478.8750305175781, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 2.064998710343049, + "grad_norm": 0.9021128416061401, + "kl": 0.1153564453125, + "learning_rate": 1e-06, + "loss": 0.0074, + "num_tokens": 234217250.0, + "reward": 1.4133931398391724, + "reward_std": 0.30901095271110535, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4133928418159485, + "rewards/curriculum_aware_reward_fn/std": 0.39643582701683044, + "step": 2001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1223.0, + "completions/max_terminated_length": 1223.0, + "completions/mean_length": 466.5000305175781, + "completions/mean_terminated_length": 466.5000305175781, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 2.0660304359040493, + "grad_norm": 0.7210460901260376, + "kl": 0.1270751953125, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 234334227.0, + "reward": 1.4799107313156128, + "reward_std": 0.15783853828907013, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4799107015132904, + "rewards/curriculum_aware_reward_fn/std": 0.4287768006324768, + "step": 2002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 917.0, + "completions/max_terminated_length": 917.0, + "completions/mean_length": 430.83038330078125, + "completions/mean_terminated_length": 430.83038330078125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 2.06706216146505, + "grad_norm": 0.7734282612800598, + "kl": 0.11767578125, + "learning_rate": 1e-06, + "loss": 0.0169, + "num_tokens": 234455770.0, + "reward": 1.5022321939468384, + "reward_std": 0.2398114651441574, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5022321343421936, + "rewards/curriculum_aware_reward_fn/std": 0.4340978264808655, + "step": 2003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1012.0, + "completions/max_terminated_length": 1012.0, + "completions/mean_length": 408.83929443359375, + "completions/mean_terminated_length": 408.83929443359375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 2.068093887026051, + "grad_norm": 0.5895684957504272, + "kl": 0.117431640625, + "learning_rate": 1e-06, + "loss": -0.0146, + "num_tokens": 234568732.0, + "reward": 1.520982265472412, + "reward_std": 0.11030647158622742, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5209821462631226, + "rewards/curriculum_aware_reward_fn/std": 0.46916821599006653, + "step": 2004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1336.0, + "completions/max_terminated_length": 1336.0, + "completions/mean_length": 442.08929443359375, + "completions/mean_terminated_length": 442.08929443359375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 2.069125612587052, + "grad_norm": 0.7536056041717529, + "kl": 0.1180419921875, + "learning_rate": 1e-06, + "loss": 0.0247, + "num_tokens": 234681575.0, + "reward": 1.4781250953674316, + "reward_std": 0.18545761704444885, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47812503576278687, + "rewards/curriculum_aware_reward_fn/std": 0.39179542660713196, + "step": 2005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1529.0, + "completions/max_terminated_length": 1529.0, + "completions/mean_length": 507.4464416503906, + "completions/mean_terminated_length": 507.4464416503906, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 2.0701573381480527, + "grad_norm": 0.7618375420570374, + "kl": 0.119873046875, + "learning_rate": 1e-06, + "loss": 0.0213, + "num_tokens": 234820911.0, + "reward": 1.3924107551574707, + "reward_std": 0.2358359545469284, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3924107253551483, + "rewards/curriculum_aware_reward_fn/std": 0.4042683243751526, + "step": 2006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1183.0, + "completions/max_terminated_length": 1183.0, + "completions/mean_length": 433.6071472167969, + "completions/mean_terminated_length": 433.6071472167969, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 2.0711890637090535, + "grad_norm": 0.5803200602531433, + "kl": 0.1112060546875, + "learning_rate": 1e-06, + "loss": 0.0193, + "num_tokens": 234936976.0, + "reward": 1.5857144594192505, + "reward_std": 0.173648864030838, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5857143402099609, + "rewards/curriculum_aware_reward_fn/std": 0.4373563528060913, + "step": 2007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 407.9196472167969, + "completions/mean_terminated_length": 407.9196472167969, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 2.0722207892700544, + "grad_norm": 0.698860764503479, + "kl": 0.1298828125, + "learning_rate": 1e-06, + "loss": 0.0149, + "num_tokens": 235052831.0, + "reward": 1.5660717487335205, + "reward_std": 0.14689664542675018, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5660714507102966, + "rewards/curriculum_aware_reward_fn/std": 0.3505603075027466, + "step": 2008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 435.4821472167969, + "completions/mean_terminated_length": 435.4821472167969, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 2.0732525148310548, + "grad_norm": 0.8031901121139526, + "kl": 0.119873046875, + "learning_rate": 1e-06, + "loss": 0.0093, + "num_tokens": 235164283.0, + "reward": 1.432142972946167, + "reward_std": 0.250482439994812, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43214288353919983, + "rewards/curriculum_aware_reward_fn/std": 0.4090671241283417, + "step": 2009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 915.0, + "completions/max_terminated_length": 915.0, + "completions/mean_length": 454.732177734375, + "completions/mean_terminated_length": 454.732177734375, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 2.0742842403920556, + "grad_norm": 0.8125240206718445, + "kl": 0.106201171875, + "learning_rate": 1e-06, + "loss": -0.0243, + "num_tokens": 235279816.0, + "reward": 1.497321605682373, + "reward_std": 0.28393781185150146, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4973214268684387, + "rewards/curriculum_aware_reward_fn/std": 0.4544989764690399, + "step": 2010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 800.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 414.58038330078125, + "completions/mean_terminated_length": 414.58038330078125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 2.0753159659530565, + "grad_norm": 0.7212795615196228, + "kl": 0.117919921875, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 235396557.0, + "reward": 1.5700894594192505, + "reward_std": 0.16203486919403076, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5700892806053162, + "rewards/curriculum_aware_reward_fn/std": 0.4427829384803772, + "step": 2011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 412.65179443359375, + "completions/mean_terminated_length": 412.65179443359375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 2.0763476915140573, + "grad_norm": 0.7393515706062317, + "kl": 0.125, + "learning_rate": 1e-06, + "loss": 0.0214, + "num_tokens": 235513838.0, + "reward": 1.594642996788025, + "reward_std": 0.22776253521442413, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5946429371833801, + "rewards/curriculum_aware_reward_fn/std": 0.4215136468410492, + "step": 2012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 868.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 470.5625305175781, + "completions/mean_terminated_length": 470.5625305175781, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 2.077379417075058, + "grad_norm": 0.6410515308380127, + "kl": 0.1263427734375, + "learning_rate": 1e-06, + "loss": -0.0024, + "num_tokens": 235642337.0, + "reward": 1.4433037042617798, + "reward_std": 0.1228586882352829, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44330358505249023, + "rewards/curriculum_aware_reward_fn/std": 0.4229079782962799, + "step": 2013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 411.9196472167969, + "completions/mean_terminated_length": 411.9196472167969, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 2.078411142636059, + "grad_norm": 0.6773447394371033, + "kl": 0.10888671875, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 235749351.0, + "reward": 1.645982265472412, + "reward_std": 0.17768552899360657, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6459822058677673, + "rewards/curriculum_aware_reward_fn/std": 0.4375988245010376, + "step": 2014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1242.0, + "completions/max_terminated_length": 1242.0, + "completions/mean_length": 443.4285888671875, + "completions/mean_terminated_length": 443.4285888671875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 2.0794428681970594, + "grad_norm": 0.6409063935279846, + "kl": 0.1204833984375, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 235868154.0, + "reward": 1.4526787996292114, + "reward_std": 0.10559501498937607, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45267853140830994, + "rewards/curriculum_aware_reward_fn/std": 0.45071709156036377, + "step": 2015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1450.0, + "completions/max_terminated_length": 1450.0, + "completions/mean_length": 473.3750305175781, + "completions/mean_terminated_length": 473.3750305175781, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 2.08047459375806, + "grad_norm": 0.7845041155815125, + "kl": 0.11279296875, + "learning_rate": 1e-06, + "loss": -0.0071, + "num_tokens": 235984816.0, + "reward": 1.485267996788025, + "reward_std": 0.2349657416343689, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48526784777641296, + "rewards/curriculum_aware_reward_fn/std": 0.40289705991744995, + "step": 2016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1621.0, + "completions/max_terminated_length": 1621.0, + "completions/mean_length": 451.919677734375, + "completions/mean_terminated_length": 451.919677734375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 2.081506319319061, + "grad_norm": 0.7908880114555359, + "kl": 0.1138916015625, + "learning_rate": 1e-06, + "loss": -0.0077, + "num_tokens": 236089653.0, + "reward": 1.6111608743667603, + "reward_std": 0.31487926840782166, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6111606955528259, + "rewards/curriculum_aware_reward_fn/std": 0.5522969961166382, + "step": 2017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1074.0, + "completions/max_terminated_length": 1074.0, + "completions/mean_length": 419.4464416503906, + "completions/mean_terminated_length": 419.4464416503906, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 2.082538044880062, + "grad_norm": 0.7708176970481873, + "kl": 0.1177978515625, + "learning_rate": 1e-06, + "loss": -0.0039, + "num_tokens": 236195263.0, + "reward": 1.563392996788025, + "reward_std": 0.19081752002239227, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5633928179740906, + "rewards/curriculum_aware_reward_fn/std": 0.42452844977378845, + "step": 2018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 433.5714416503906, + "completions/mean_terminated_length": 433.5714416503906, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 2.0835697704410627, + "grad_norm": 0.6971049904823303, + "kl": 0.120849609375, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 236316273.0, + "reward": 1.4839287996292114, + "reward_std": 0.2088971883058548, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4839285910129547, + "rewards/curriculum_aware_reward_fn/std": 0.4364173710346222, + "step": 2019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 876.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 457.169677734375, + "completions/mean_terminated_length": 457.169677734375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 2.0846014960020636, + "grad_norm": 0.8070178031921387, + "kl": 0.1220703125, + "learning_rate": 1e-06, + "loss": 0.0044, + "num_tokens": 236427795.0, + "reward": 1.4075894355773926, + "reward_std": 0.20527462661266327, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40758928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3741181790828705, + "step": 2020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 693.0, + "completions/max_terminated_length": 693.0, + "completions/mean_length": 405.8125305175781, + "completions/mean_terminated_length": 405.8125305175781, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 2.0856332215630644, + "grad_norm": 0.7929508090019226, + "kl": 0.124267578125, + "learning_rate": 1e-06, + "loss": 0.0199, + "num_tokens": 236541265.0, + "reward": 1.5303572416305542, + "reward_std": 0.1638633906841278, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5303571820259094, + "rewards/curriculum_aware_reward_fn/std": 0.37837710976600647, + "step": 2021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 416.3571472167969, + "completions/mean_terminated_length": 416.3571472167969, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 2.086664947124065, + "grad_norm": 0.8954543471336365, + "kl": 0.12060546875, + "learning_rate": 1e-06, + "loss": 0.0046, + "num_tokens": 236653214.0, + "reward": 1.5348213911056519, + "reward_std": 0.2690960764884949, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5348214507102966, + "rewards/curriculum_aware_reward_fn/std": 0.4390222132205963, + "step": 2022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1526.0, + "completions/max_terminated_length": 1526.0, + "completions/mean_length": 454.2589416503906, + "completions/mean_terminated_length": 454.2589416503906, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 2.0876966726850656, + "grad_norm": 0.7094557881355286, + "kl": 0.1168212890625, + "learning_rate": 1e-06, + "loss": 0.008, + "num_tokens": 236776177.0, + "reward": 1.5254465341567993, + "reward_std": 0.1793346107006073, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5254464149475098, + "rewards/curriculum_aware_reward_fn/std": 0.4094049632549286, + "step": 2023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 883.0, + "completions/max_terminated_length": 883.0, + "completions/mean_length": 429.1607360839844, + "completions/mean_terminated_length": 429.1607360839844, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 2.0887283982460665, + "grad_norm": 0.740519642829895, + "kl": 0.1337890625, + "learning_rate": 1e-06, + "loss": 0.0209, + "num_tokens": 236897029.0, + "reward": 1.4156250953674316, + "reward_std": 0.18167220056056976, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4156250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.4546571373939514, + "step": 2024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 409.5535888671875, + "completions/mean_terminated_length": 409.5535888671875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 2.0897601238070673, + "grad_norm": 0.6365262269973755, + "kl": 0.1259765625, + "learning_rate": 1e-06, + "loss": -0.0217, + "num_tokens": 237009742.0, + "reward": 1.567857265472412, + "reward_std": 0.15161246061325073, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5678571462631226, + "rewards/curriculum_aware_reward_fn/std": 0.4445291757583618, + "step": 2025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 427.3750305175781, + "completions/mean_terminated_length": 394.3243408203125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 2.090791849368068, + "grad_norm": 0.6970322728157043, + "kl": 0.1146240234375, + "learning_rate": 1e-06, + "loss": -0.0093, + "num_tokens": 237133222.0, + "reward": 1.6142858266830444, + "reward_std": 0.21715456247329712, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6142857670783997, + "rewards/curriculum_aware_reward_fn/std": 0.41754475235939026, + "step": 2026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 957.0, + "completions/max_terminated_length": 957.0, + "completions/mean_length": 464.8750305175781, + "completions/mean_terminated_length": 464.8750305175781, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 2.091823574929069, + "grad_norm": 0.807088315486908, + "kl": 0.113037109375, + "learning_rate": 1e-06, + "loss": 0.0205, + "num_tokens": 237248708.0, + "reward": 1.5125001668930054, + "reward_std": 0.19637462496757507, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.512499988079071, + "rewards/curriculum_aware_reward_fn/std": 0.4132108986377716, + "step": 2027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1139.0, + "completions/max_terminated_length": 1139.0, + "completions/mean_length": 436.0535888671875, + "completions/mean_terminated_length": 436.0535888671875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 2.09285530049007, + "grad_norm": 0.6729896068572998, + "kl": 0.120849609375, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 237369429.0, + "reward": 1.4919644594192505, + "reward_std": 0.17199517786502838, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5008928775787354, + "rewards/curriculum_aware_reward_fn/std": 0.42304036021232605, + "step": 2028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 938.0, + "completions/max_terminated_length": 938.0, + "completions/mean_length": 395.83038330078125, + "completions/mean_terminated_length": 395.83038330078125, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 2.0938870260510702, + "grad_norm": 0.7247878909111023, + "kl": 0.12548828125, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 237475054.0, + "reward": 1.5147322416305542, + "reward_std": 0.18124648928642273, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5236607193946838, + "rewards/curriculum_aware_reward_fn/std": 0.444939523935318, + "step": 2029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1456.0, + "completions/max_terminated_length": 1456.0, + "completions/mean_length": 464.0000305175781, + "completions/mean_terminated_length": 464.0000305175781, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 2.094918751612071, + "grad_norm": 0.7838840484619141, + "kl": 0.1431884765625, + "learning_rate": 1e-06, + "loss": 0.0104, + "num_tokens": 237598926.0, + "reward": 1.471428632736206, + "reward_std": 0.2576696574687958, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.48035717010498047, + "rewards/curriculum_aware_reward_fn/std": 0.41931721568107605, + "step": 2030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 399.26788330078125, + "completions/mean_terminated_length": 399.26788330078125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 2.095950477173072, + "grad_norm": 0.8283127546310425, + "kl": 0.12158203125, + "learning_rate": 1e-06, + "loss": 0.0458, + "num_tokens": 237712524.0, + "reward": 1.4522322416305542, + "reward_std": 0.22037231922149658, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45223215222358704, + "rewards/curriculum_aware_reward_fn/std": 0.444606751203537, + "step": 2031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2448.0, + "completions/max_terminated_length": 2448.0, + "completions/mean_length": 399.4821472167969, + "completions/mean_terminated_length": 399.4821472167969, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 2.0969822027340728, + "grad_norm": 0.612908124923706, + "kl": 0.1302490234375, + "learning_rate": 1e-06, + "loss": 0.0107, + "num_tokens": 237829779.0, + "reward": 1.6995537281036377, + "reward_std": 0.09788351505994797, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6995535492897034, + "rewards/curriculum_aware_reward_fn/std": 0.3540305495262146, + "step": 2032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1341.0, + "completions/max_terminated_length": 1341.0, + "completions/mean_length": 444.6875305175781, + "completions/mean_terminated_length": 444.6875305175781, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 2.0980139282950736, + "grad_norm": 0.6735214591026306, + "kl": 0.1121826171875, + "learning_rate": 1e-06, + "loss": -0.0063, + "num_tokens": 237948653.0, + "reward": 1.4866071939468384, + "reward_std": 0.1361234039068222, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4866071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.3955257534980774, + "step": 2033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 419.6160888671875, + "completions/mean_terminated_length": 386.4955139160156, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 2.0990456538560744, + "grad_norm": 0.6528956294059753, + "kl": 0.118896484375, + "learning_rate": 1e-06, + "loss": 0.0572, + "num_tokens": 238063018.0, + "reward": 1.5607143640518188, + "reward_std": 0.12395340204238892, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5696428418159485, + "rewards/curriculum_aware_reward_fn/std": 0.4645717144012451, + "step": 2034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1086.0, + "completions/max_terminated_length": 1086.0, + "completions/mean_length": 473.83038330078125, + "completions/mean_terminated_length": 473.83038330078125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 2.1000773794170753, + "grad_norm": 0.7285524606704712, + "kl": 0.1185302734375, + "learning_rate": 1e-06, + "loss": 0.0201, + "num_tokens": 238190441.0, + "reward": 1.4928573369979858, + "reward_std": 0.1570131480693817, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5017856955528259, + "rewards/curriculum_aware_reward_fn/std": 0.4277496337890625, + "step": 2035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1259.0, + "completions/max_terminated_length": 1259.0, + "completions/mean_length": 423.5982360839844, + "completions/mean_terminated_length": 423.5982360839844, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 2.1011091049780757, + "grad_norm": 0.9682310819625854, + "kl": 0.1566162109375, + "learning_rate": 1e-06, + "loss": -0.0046, + "num_tokens": 238303852.0, + "reward": 1.406250238418579, + "reward_std": 0.1819208562374115, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40625, + "rewards/curriculum_aware_reward_fn/std": 0.3843284249305725, + "step": 2036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1290.0, + "completions/max_terminated_length": 1290.0, + "completions/mean_length": 475.8660888671875, + "completions/mean_terminated_length": 475.8660888671875, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 2.1021408305390765, + "grad_norm": 0.7141610980033875, + "kl": 0.1177978515625, + "learning_rate": 1e-06, + "loss": 0.0045, + "num_tokens": 238430978.0, + "reward": 1.3500001430511475, + "reward_std": 0.1426495909690857, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3499999940395355, + "rewards/curriculum_aware_reward_fn/std": 0.3537444472312927, + "step": 2037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 372.9375305175781, + "completions/mean_terminated_length": 372.9375305175781, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 2.1031725561000774, + "grad_norm": 0.9728583693504333, + "kl": 0.14453125, + "learning_rate": 1e-06, + "loss": 0.0047, + "num_tokens": 238533472.0, + "reward": 1.427232265472412, + "reward_std": 0.24139705300331116, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.44508928060531616, + "rewards/curriculum_aware_reward_fn/std": 0.4519706070423126, + "step": 2038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 426.01788330078125, + "completions/mean_terminated_length": 426.01788330078125, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 2.104204281661078, + "grad_norm": 0.7383515238761902, + "kl": 0.1279296875, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 238658921.0, + "reward": 1.6232143640518188, + "reward_std": 0.18711508810520172, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6232143044471741, + "rewards/curriculum_aware_reward_fn/std": 0.383260577917099, + "step": 2039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 418.7500305175781, + "completions/mean_terminated_length": 418.7500305175781, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 2.105236007222079, + "grad_norm": 0.5161622166633606, + "kl": 0.122314453125, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 238778348.0, + "reward": 1.4593751430511475, + "reward_std": 0.08913283795118332, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4593749940395355, + "rewards/curriculum_aware_reward_fn/std": 0.4484975337982178, + "step": 2040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1151.0, + "completions/max_terminated_length": 1151.0, + "completions/mean_length": 435.6250305175781, + "completions/mean_terminated_length": 435.6250305175781, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 2.10626773278308, + "grad_norm": 0.7753861546516418, + "kl": 0.11962890625, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 238894898.0, + "reward": 1.5455358028411865, + "reward_std": 0.2276323437690735, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.545535683631897, + "rewards/curriculum_aware_reward_fn/std": 0.4044545590877533, + "step": 2041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 399.9821472167969, + "completions/mean_terminated_length": 399.9821472167969, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 2.1072994583440803, + "grad_norm": 0.804185152053833, + "kl": 0.1265869140625, + "learning_rate": 1e-06, + "loss": -0.0039, + "num_tokens": 239016682.0, + "reward": 1.6325894594192505, + "reward_std": 0.16231726109981537, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6325892806053162, + "rewards/curriculum_aware_reward_fn/std": 0.5415073037147522, + "step": 2042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 421.89288330078125, + "completions/mean_terminated_length": 421.89288330078125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 2.108331183905081, + "grad_norm": 0.7326617240905762, + "kl": 0.12353515625, + "learning_rate": 1e-06, + "loss": -0.003, + "num_tokens": 239130959.0, + "reward": 1.5669643878936768, + "reward_std": 0.2607947587966919, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5669642686843872, + "rewards/curriculum_aware_reward_fn/std": 0.41397616267204285, + "step": 2043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1036.0, + "completions/max_terminated_length": 1036.0, + "completions/mean_length": 444.08929443359375, + "completions/mean_terminated_length": 444.08929443359375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 2.109362909466082, + "grad_norm": 0.737518310546875, + "kl": 0.1077880859375, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 239252202.0, + "reward": 1.4821429252624512, + "reward_std": 0.20218975841999054, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4821428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.46172693371772766, + "step": 2044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1561.0, + "completions/max_terminated_length": 1561.0, + "completions/mean_length": 468.4375305175781, + "completions/mean_terminated_length": 468.4375305175781, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 2.110394635027083, + "grad_norm": 0.7885037064552307, + "kl": 0.1168212890625, + "learning_rate": 1e-06, + "loss": -0.0091, + "num_tokens": 239374653.0, + "reward": 1.3750001192092896, + "reward_std": 0.17816029489040375, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.375, + "rewards/curriculum_aware_reward_fn/std": 0.380788654088974, + "step": 2045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1079.0, + "completions/max_terminated_length": 1079.0, + "completions/mean_length": 401.33929443359375, + "completions/mean_terminated_length": 401.33929443359375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 2.1114263605880836, + "grad_norm": 0.8109065890312195, + "kl": 0.134033203125, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 239493635.0, + "reward": 1.4455360174179077, + "reward_std": 0.23022522032260895, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44553571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3891867995262146, + "step": 2046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1016.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 445.1964416503906, + "completions/mean_terminated_length": 445.1964416503906, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 2.1124580861490845, + "grad_norm": 0.7626376152038574, + "kl": 0.131103515625, + "learning_rate": 1e-06, + "loss": -0.025, + "num_tokens": 239622688.0, + "reward": 1.3906251192092896, + "reward_std": 0.18730053305625916, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.390625, + "rewards/curriculum_aware_reward_fn/std": 0.41970235109329224, + "step": 2047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1603.0, + "completions/max_terminated_length": 1603.0, + "completions/mean_length": 445.26788330078125, + "completions/mean_terminated_length": 445.26788330078125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 2.1134898117100853, + "grad_norm": 0.7581174373626709, + "kl": 0.1285400390625, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 239731995.0, + "reward": 1.6513394117355347, + "reward_std": 0.24180802702903748, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.6691964268684387, + "rewards/curriculum_aware_reward_fn/std": 0.39107412099838257, + "step": 2048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 421.0625305175781, + "completions/mean_terminated_length": 421.0625305175781, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 2.1145215372710857, + "grad_norm": 0.8369876742362976, + "kl": 0.108642578125, + "learning_rate": 1e-06, + "loss": 0.0334, + "num_tokens": 239841048.0, + "reward": 1.5040180683135986, + "reward_std": 0.18611615896224976, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5040178894996643, + "rewards/curriculum_aware_reward_fn/std": 0.42315515875816345, + "step": 2049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1976.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 483.107177734375, + "completions/mean_terminated_length": 483.107177734375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 2.1155532628320866, + "grad_norm": 0.6806438565254211, + "kl": 0.10986328125, + "learning_rate": 1e-06, + "loss": 0.0284, + "num_tokens": 239965016.0, + "reward": 1.3593751192092896, + "reward_std": 0.1829347163438797, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.359375, + "rewards/curriculum_aware_reward_fn/std": 0.38082006573677063, + "step": 2050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1345.0, + "completions/max_terminated_length": 1345.0, + "completions/mean_length": 433.40179443359375, + "completions/mean_terminated_length": 433.40179443359375, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 2.1165849883930874, + "grad_norm": 0.7132896780967712, + "kl": 0.12939453125, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 240086000.0, + "reward": 1.2906250953674316, + "reward_std": 0.14403748512268066, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2906250059604645, + "rewards/curriculum_aware_reward_fn/std": 0.37473902106285095, + "step": 2051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1810.0, + "completions/max_terminated_length": 1810.0, + "completions/mean_length": 426.9732360839844, + "completions/mean_terminated_length": 426.9732360839844, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 2.1176167139540882, + "grad_norm": 0.8702093958854675, + "kl": 0.135009765625, + "learning_rate": 1e-06, + "loss": 0.0112, + "num_tokens": 240196555.0, + "reward": 1.5214285850524902, + "reward_std": 0.2170983999967575, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5214285850524902, + "rewards/curriculum_aware_reward_fn/std": 0.41572222113609314, + "step": 2052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3992.0, + "completions/max_terminated_length": 3992.0, + "completions/mean_length": 469.9107360839844, + "completions/mean_terminated_length": 469.9107360839844, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 2.118648439515089, + "grad_norm": 0.5609109997749329, + "kl": 0.104248046875, + "learning_rate": 1e-06, + "loss": -0.0232, + "num_tokens": 240318044.0, + "reward": 1.4709821939468384, + "reward_std": 0.15642313659191132, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4709821343421936, + "rewards/curriculum_aware_reward_fn/std": 0.43281179666519165, + "step": 2053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 916.0, + "completions/max_terminated_length": 916.0, + "completions/mean_length": 433.96429443359375, + "completions/mean_terminated_length": 433.96429443359375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 2.11968016507609, + "grad_norm": 0.693916916847229, + "kl": 0.1099853515625, + "learning_rate": 1e-06, + "loss": 0.0154, + "num_tokens": 240436529.0, + "reward": 1.395535945892334, + "reward_std": 0.15839314460754395, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3955357074737549, + "rewards/curriculum_aware_reward_fn/std": 0.40372997522354126, + "step": 2054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3397.0, + "completions/max_terminated_length": 3397.0, + "completions/mean_length": 470.0357360839844, + "completions/mean_terminated_length": 470.0357360839844, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 2.1207118906370903, + "grad_norm": 0.8244123458862305, + "kl": 0.1292724609375, + "learning_rate": 1e-06, + "loss": 0.0302, + "num_tokens": 240553846.0, + "reward": 1.442857265472412, + "reward_std": 0.24676746129989624, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44285711646080017, + "rewards/curriculum_aware_reward_fn/std": 0.4205774962902069, + "step": 2055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 408.08038330078125, + "completions/mean_terminated_length": 408.08038330078125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 2.121743616198091, + "grad_norm": 0.5505373477935791, + "kl": 0.1217041015625, + "learning_rate": 1e-06, + "loss": 0.0145, + "num_tokens": 240662348.0, + "reward": 1.4209821224212646, + "reward_std": 0.1188260167837143, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.42991071939468384, + "rewards/curriculum_aware_reward_fn/std": 0.48632004857063293, + "step": 2056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 894.0, + "completions/max_terminated_length": 894.0, + "completions/mean_length": 449.107177734375, + "completions/mean_terminated_length": 449.107177734375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 2.122775341759092, + "grad_norm": 0.638522744178772, + "kl": 0.1121826171875, + "learning_rate": 1e-06, + "loss": -0.0101, + "num_tokens": 240784188.0, + "reward": 1.4647324085235596, + "reward_std": 0.15416447818279266, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46473217010498047, + "rewards/curriculum_aware_reward_fn/std": 0.4712279438972473, + "step": 2057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1054.0, + "completions/max_terminated_length": 1054.0, + "completions/mean_length": 441.0089416503906, + "completions/mean_terminated_length": 441.0089416503906, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 2.123807067320093, + "grad_norm": 0.6578857898712158, + "kl": 0.1224365234375, + "learning_rate": 1e-06, + "loss": -0.0172, + "num_tokens": 240898730.0, + "reward": 1.524553656578064, + "reward_std": 0.16847702860832214, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5245535969734192, + "rewards/curriculum_aware_reward_fn/std": 0.4525202214717865, + "step": 2058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1002.0, + "completions/mean_length": 478.2589416503906, + "completions/mean_terminated_length": 445.66668701171875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 2.1248387928810937, + "grad_norm": 0.8479138016700745, + "kl": 0.122802734375, + "learning_rate": 1e-06, + "loss": 0.0364, + "num_tokens": 241023436.0, + "reward": 1.4089287519454956, + "reward_std": 0.23351380228996277, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.41785717010498047, + "rewards/curriculum_aware_reward_fn/std": 0.4163254499435425, + "step": 2059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 862.0, + "completions/max_terminated_length": 862.0, + "completions/mean_length": 447.794677734375, + "completions/mean_terminated_length": 447.794677734375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 2.1258705184420945, + "grad_norm": 0.833861231803894, + "kl": 0.1229248046875, + "learning_rate": 1e-06, + "loss": -0.011, + "num_tokens": 241152024.0, + "reward": 1.502678632736206, + "reward_std": 0.2115645557641983, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5026785135269165, + "rewards/curriculum_aware_reward_fn/std": 0.42414936423301697, + "step": 2060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2355.0, + "completions/max_terminated_length": 2355.0, + "completions/mean_length": 469.5982360839844, + "completions/mean_terminated_length": 469.5982360839844, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 2.1269022440030954, + "grad_norm": 0.8299174308776855, + "kl": 0.117919921875, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 241280527.0, + "reward": 1.3687500953674316, + "reward_std": 0.2414630949497223, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36875003576278687, + "rewards/curriculum_aware_reward_fn/std": 0.40576502680778503, + "step": 2061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 928.0, + "completions/max_terminated_length": 928.0, + "completions/mean_length": 439.45538330078125, + "completions/mean_terminated_length": 439.45538330078125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 2.1279339695640958, + "grad_norm": 0.8200660347938538, + "kl": 0.1226806640625, + "learning_rate": 1e-06, + "loss": 0.0451, + "num_tokens": 241400677.0, + "reward": 1.559821605682373, + "reward_std": 0.18447676301002502, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5687499642372131, + "rewards/curriculum_aware_reward_fn/std": 0.44337591528892517, + "step": 2062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1236.0, + "completions/max_terminated_length": 1236.0, + "completions/mean_length": 442.3482360839844, + "completions/mean_terminated_length": 442.3482360839844, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 2.1289656951250966, + "grad_norm": 0.7119675874710083, + "kl": 0.1207275390625, + "learning_rate": 1e-06, + "loss": 0.0318, + "num_tokens": 241512618.0, + "reward": 1.4754464626312256, + "reward_std": 0.16743336617946625, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4754464328289032, + "rewards/curriculum_aware_reward_fn/std": 0.4168195426464081, + "step": 2063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3617.0, + "completions/max_terminated_length": 3617.0, + "completions/mean_length": 447.7232360839844, + "completions/mean_terminated_length": 447.7232360839844, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 2.1299974206860974, + "grad_norm": 0.7517739534378052, + "kl": 0.1259765625, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 241629704.0, + "reward": 1.4156250953674316, + "reward_std": 0.2064957469701767, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41562503576278687, + "rewards/curriculum_aware_reward_fn/std": 0.4368721544742584, + "step": 2064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 848.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 428.2232360839844, + "completions/mean_terminated_length": 428.2232360839844, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 2.1310291462470983, + "grad_norm": 0.8079065084457397, + "kl": 0.1055908203125, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 241745803.0, + "reward": 1.557142972946167, + "reward_std": 0.23306429386138916, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5571428537368774, + "rewards/curriculum_aware_reward_fn/std": 0.4445291757583618, + "step": 2065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1499.0, + "completions/max_terminated_length": 1499.0, + "completions/mean_length": 421.27679443359375, + "completions/mean_terminated_length": 421.27679443359375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 2.132060871808099, + "grad_norm": 0.6699889898300171, + "kl": 0.110595703125, + "learning_rate": 1e-06, + "loss": -0.0165, + "num_tokens": 241848482.0, + "reward": 1.5254465341567993, + "reward_std": 0.1498248130083084, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5254464149475098, + "rewards/curriculum_aware_reward_fn/std": 0.4337140917778015, + "step": 2066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 769.0, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 399.4910888671875, + "completions/mean_terminated_length": 399.4910888671875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 2.1330925973691, + "grad_norm": 0.8710476756095886, + "kl": 0.1302490234375, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 241960292.0, + "reward": 1.5870537757873535, + "reward_std": 0.24912148714065552, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5870535969734192, + "rewards/curriculum_aware_reward_fn/std": 0.40077322721481323, + "step": 2067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1508.0, + "completions/max_terminated_length": 1508.0, + "completions/mean_length": 426.8839416503906, + "completions/mean_terminated_length": 426.8839416503906, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 2.134124322930101, + "grad_norm": 0.8446362018585205, + "kl": 0.1260986328125, + "learning_rate": 1e-06, + "loss": -0.0197, + "num_tokens": 242076640.0, + "reward": 1.3781250715255737, + "reward_std": 0.23388829827308655, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37812498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.409062922000885, + "step": 2068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1901.0, + "completions/max_terminated_length": 1901.0, + "completions/mean_length": 467.107177734375, + "completions/mean_terminated_length": 467.107177734375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 2.135156048491101, + "grad_norm": 0.7577447891235352, + "kl": 0.1160888671875, + "learning_rate": 1e-06, + "loss": 0.0406, + "num_tokens": 242198655.0, + "reward": 1.422767996788025, + "reward_std": 0.17161372303962708, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.43169641494750977, + "rewards/curriculum_aware_reward_fn/std": 0.4034197926521301, + "step": 2069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 908.0, + "completions/max_terminated_length": 908.0, + "completions/mean_length": 400.39288330078125, + "completions/mean_terminated_length": 400.39288330078125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 2.136187774052102, + "grad_norm": 0.7759943008422852, + "kl": 0.122314453125, + "learning_rate": 1e-06, + "loss": 0.0061, + "num_tokens": 242318343.0, + "reward": 1.6142858266830444, + "reward_std": 0.18312881886959076, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6142857670783997, + "rewards/curriculum_aware_reward_fn/std": 0.43409988284111023, + "step": 2070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1280.0, + "completions/max_terminated_length": 1280.0, + "completions/mean_length": 456.2857360839844, + "completions/mean_terminated_length": 456.2857360839844, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 2.137219499613103, + "grad_norm": 0.6259797811508179, + "kl": 0.1077880859375, + "learning_rate": 1e-06, + "loss": -0.0289, + "num_tokens": 242438831.0, + "reward": 1.5111607313156128, + "reward_std": 0.22564736008644104, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5111607313156128, + "rewards/curriculum_aware_reward_fn/std": 0.4776872992515564, + "step": 2071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 976.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 411.9375305175781, + "completions/mean_terminated_length": 411.9375305175781, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 2.1382512251741037, + "grad_norm": 0.7962878942489624, + "kl": 0.1103515625, + "learning_rate": 1e-06, + "loss": -0.0324, + "num_tokens": 242547520.0, + "reward": 1.6008931398391724, + "reward_std": 0.29236966371536255, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6098214387893677, + "rewards/curriculum_aware_reward_fn/std": 0.4369138181209564, + "step": 2072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1229.0, + "completions/max_terminated_length": 1229.0, + "completions/mean_length": 428.51788330078125, + "completions/mean_terminated_length": 428.51788330078125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 2.1392829507351045, + "grad_norm": 0.7675960063934326, + "kl": 0.126220703125, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 242670449.0, + "reward": 1.4281251430511475, + "reward_std": 0.2562326490879059, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42812496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.44386619329452515, + "step": 2073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1037.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 410.9910888671875, + "completions/mean_terminated_length": 410.9910888671875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 2.1403146762961054, + "grad_norm": 0.8732093572616577, + "kl": 0.1181640625, + "learning_rate": 1e-06, + "loss": -0.0072, + "num_tokens": 242791037.0, + "reward": 1.5017858743667603, + "reward_std": 0.27640336751937866, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5017856955528259, + "rewards/curriculum_aware_reward_fn/std": 0.4054209887981415, + "step": 2074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1113.0, + "completions/max_terminated_length": 1113.0, + "completions/mean_length": 438.7232360839844, + "completions/mean_terminated_length": 438.7232360839844, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 2.1413464018571062, + "grad_norm": 0.7985749244689941, + "kl": 0.133056640625, + "learning_rate": 1e-06, + "loss": -0.0117, + "num_tokens": 242901368.0, + "reward": 1.5656250715255737, + "reward_std": 0.1947113573551178, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.565625011920929, + "rewards/curriculum_aware_reward_fn/std": 0.41557687520980835, + "step": 2075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1259.0, + "completions/max_terminated_length": 1259.0, + "completions/mean_length": 455.5625305175781, + "completions/mean_terminated_length": 455.5625305175781, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 2.1423781274181066, + "grad_norm": 0.7833972573280334, + "kl": 0.1212158203125, + "learning_rate": 1e-06, + "loss": 0.0117, + "num_tokens": 243015321.0, + "reward": 1.4928572177886963, + "reward_std": 0.22096051275730133, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4928571283817291, + "rewards/curriculum_aware_reward_fn/std": 0.3814893066883087, + "step": 2076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1843.0, + "completions/max_terminated_length": 1843.0, + "completions/mean_length": 443.20538330078125, + "completions/mean_terminated_length": 443.20538330078125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 2.1434098529791075, + "grad_norm": 0.8090054392814636, + "kl": 0.1243896484375, + "learning_rate": 1e-06, + "loss": 0.016, + "num_tokens": 243135618.0, + "reward": 1.3790180683135986, + "reward_std": 0.19926463067531586, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3790178596973419, + "rewards/curriculum_aware_reward_fn/std": 0.3928779363632202, + "step": 2077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3311.0, + "completions/max_terminated_length": 3311.0, + "completions/mean_length": 471.6339416503906, + "completions/mean_terminated_length": 471.6339416503906, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 2.1444415785401083, + "grad_norm": 0.9836584329605103, + "kl": 0.15234375, + "learning_rate": 1e-06, + "loss": 0.0335, + "num_tokens": 243262704.0, + "reward": 1.5544644594192505, + "reward_std": 0.2018306702375412, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5544642806053162, + "rewards/curriculum_aware_reward_fn/std": 0.3974328339099884, + "step": 2078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1575.0, + "completions/max_terminated_length": 1575.0, + "completions/mean_length": 479.8660888671875, + "completions/mean_terminated_length": 479.8660888671875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 2.145473304101109, + "grad_norm": 0.7660709023475647, + "kl": 0.1162109375, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 243380555.0, + "reward": 1.4000000953674316, + "reward_std": 0.2146679162979126, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4000000059604645, + "rewards/curriculum_aware_reward_fn/std": 0.4533161222934723, + "step": 2079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1415.0, + "completions/max_terminated_length": 1415.0, + "completions/mean_length": 443.96429443359375, + "completions/mean_terminated_length": 443.96429443359375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 2.14650502966211, + "grad_norm": 0.7633886337280273, + "kl": 0.1260986328125, + "learning_rate": 1e-06, + "loss": 0.0189, + "num_tokens": 243496030.0, + "reward": 1.364285945892334, + "reward_std": 0.1707383245229721, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3642857074737549, + "rewards/curriculum_aware_reward_fn/std": 0.3768903613090515, + "step": 2080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1422.0, + "completions/max_terminated_length": 1422.0, + "completions/mean_length": 437.4196472167969, + "completions/mean_terminated_length": 437.4196472167969, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 2.147536755223111, + "grad_norm": 0.5623828172683716, + "kl": 0.134521484375, + "learning_rate": 1e-06, + "loss": -0.0066, + "num_tokens": 243611199.0, + "reward": 1.46473228931427, + "reward_std": 0.17655536532402039, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.4825893044471741, + "rewards/curriculum_aware_reward_fn/std": 0.46936365962028503, + "step": 2081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3209.0, + "completions/max_terminated_length": 3209.0, + "completions/mean_length": 473.1607360839844, + "completions/mean_terminated_length": 473.1607360839844, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 2.1485684807841112, + "grad_norm": 0.5925241112709045, + "kl": 0.12158203125, + "learning_rate": 1e-06, + "loss": 0.0526, + "num_tokens": 243732247.0, + "reward": 1.4468752145767212, + "reward_std": 0.1427421271800995, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.45580360293388367, + "rewards/curriculum_aware_reward_fn/std": 0.40955621004104614, + "step": 2082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2060.0, + "completions/max_terminated_length": 2060.0, + "completions/mean_length": 524.8125, + "completions/mean_terminated_length": 524.8125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 2.149600206345112, + "grad_norm": 0.5460736751556396, + "kl": 0.120849609375, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 243856651.0, + "reward": 1.4629465341567993, + "reward_std": 0.17079660296440125, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.47187498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.45543670654296875, + "step": 2083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1341.0, + "completions/max_terminated_length": 1341.0, + "completions/mean_length": 457.45538330078125, + "completions/mean_terminated_length": 457.45538330078125, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 2.150631931906113, + "grad_norm": 0.8701239824295044, + "kl": 0.134765625, + "learning_rate": 1e-06, + "loss": -0.0452, + "num_tokens": 243983976.0, + "reward": 1.4397321939468384, + "reward_std": 0.20743878185749054, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4397321343421936, + "rewards/curriculum_aware_reward_fn/std": 0.39158594608306885, + "step": 2084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1926.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 525.6607666015625, + "completions/mean_terminated_length": 525.6607666015625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 2.1516636574671137, + "grad_norm": 0.616726815700531, + "kl": 0.1109619140625, + "learning_rate": 1e-06, + "loss": -0.0058, + "num_tokens": 244113900.0, + "reward": 1.4772322177886963, + "reward_std": 0.14241279661655426, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4772321581840515, + "rewards/curriculum_aware_reward_fn/std": 0.4339161813259125, + "step": 2085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1227.0, + "completions/max_terminated_length": 1227.0, + "completions/mean_length": 480.58038330078125, + "completions/mean_terminated_length": 480.58038330078125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 2.1526953830281146, + "grad_norm": 0.561264157295227, + "kl": 0.1026611328125, + "learning_rate": 1e-06, + "loss": 0.021, + "num_tokens": 244247959.0, + "reward": 1.6464285850524902, + "reward_std": 0.11487598717212677, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6464285254478455, + "rewards/curriculum_aware_reward_fn/std": 0.43010130524635315, + "step": 2086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1466.0, + "completions/mean_length": 542.1160888671875, + "completions/mean_terminated_length": 477.5, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 2.1537271085891154, + "grad_norm": 0.6171847581863403, + "kl": 0.115966796875, + "learning_rate": 1e-06, + "loss": 0.0731, + "num_tokens": 244379593.0, + "reward": 1.3107144832611084, + "reward_std": 0.21698839962482452, + "rewards/code_format_reward/mean": 0.9642857313156128, + "rewards/code_format_reward/std": 0.18641091883182526, + "rewards/curriculum_aware_reward_fn/mean": 0.34642860293388367, + "rewards/curriculum_aware_reward_fn/std": 0.41196706891059875, + "step": 2087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1726.0, + "completions/max_terminated_length": 1726.0, + "completions/mean_length": 444.169677734375, + "completions/mean_terminated_length": 444.169677734375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 2.1547588341501163, + "grad_norm": 0.7611954808235168, + "kl": 0.1361083984375, + "learning_rate": 1e-06, + "loss": -0.0192, + "num_tokens": 244496302.0, + "reward": 1.6562501192092896, + "reward_std": 0.2018815129995346, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.65625, + "rewards/curriculum_aware_reward_fn/std": 0.5501484274864197, + "step": 2088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1449.0, + "completions/max_terminated_length": 1449.0, + "completions/mean_length": 443.1250305175781, + "completions/mean_terminated_length": 443.1250305175781, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 2.1557905597111167, + "grad_norm": 0.6869838833808899, + "kl": 0.131103515625, + "learning_rate": 1e-06, + "loss": -0.0039, + "num_tokens": 244612894.0, + "reward": 1.5941966772079468, + "reward_std": 0.1067938357591629, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5941963791847229, + "rewards/curriculum_aware_reward_fn/std": 0.41125744581222534, + "step": 2089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 983.0, + "completions/mean_length": 521.2857666015625, + "completions/mean_terminated_length": 456.2908935546875, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 2.1568222852721175, + "grad_norm": 0.643674910068512, + "kl": 0.1141357421875, + "learning_rate": 1e-06, + "loss": 0.0695, + "num_tokens": 244733998.0, + "reward": 1.3696428537368774, + "reward_std": 0.24212218821048737, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.38750001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.42651426792144775, + "step": 2090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1299.0, + "completions/mean_length": 517.8035888671875, + "completions/mean_terminated_length": 485.56756591796875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 2.1578540108331183, + "grad_norm": 0.6695377230644226, + "kl": 0.123046875, + "learning_rate": 1e-06, + "loss": 0.0219, + "num_tokens": 244863798.0, + "reward": 1.412500023841858, + "reward_std": 0.2101779282093048, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.42142853140830994, + "rewards/curriculum_aware_reward_fn/std": 0.446140319108963, + "step": 2091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1477.0, + "completions/max_terminated_length": 1477.0, + "completions/mean_length": 503.0535888671875, + "completions/mean_terminated_length": 503.0535888671875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 2.158885736394119, + "grad_norm": 0.7329318523406982, + "kl": 0.1243896484375, + "learning_rate": 1e-06, + "loss": -0.0035, + "num_tokens": 244984792.0, + "reward": 1.4683037996292114, + "reward_std": 0.25523483753204346, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4772321581840515, + "rewards/curriculum_aware_reward_fn/std": 0.4468553066253662, + "step": 2092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1368.0, + "completions/mean_length": 537.0625, + "completions/mean_terminated_length": 505.0, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 2.15991746195512, + "grad_norm": 0.8445482850074768, + "kl": 0.137939453125, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 245118282.0, + "reward": 1.4089287519454956, + "reward_std": 0.2673889100551605, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.41785717010498047, + "rewards/curriculum_aware_reward_fn/std": 0.38630014657974243, + "step": 2093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 853.0, + "completions/max_terminated_length": 853.0, + "completions/mean_length": 474.1785888671875, + "completions/mean_terminated_length": 474.1785888671875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 2.160949187516121, + "grad_norm": 0.693393349647522, + "kl": 0.1282958984375, + "learning_rate": 1e-06, + "loss": 0.029, + "num_tokens": 245231415.0, + "reward": 1.4066966772079468, + "reward_std": 0.15930292010307312, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4066964089870453, + "rewards/curriculum_aware_reward_fn/std": 0.4089370667934418, + "step": 2094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1649.0, + "completions/max_terminated_length": 1649.0, + "completions/mean_length": 474.95538330078125, + "completions/mean_terminated_length": 474.95538330078125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 2.1619809130771213, + "grad_norm": 0.7273092865943909, + "kl": 0.113037109375, + "learning_rate": 1e-06, + "loss": 0.021, + "num_tokens": 245348590.0, + "reward": 1.5553573369979858, + "reward_std": 0.25116923451423645, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5553571581840515, + "rewards/curriculum_aware_reward_fn/std": 0.4208719730377197, + "step": 2095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1380.0, + "completions/max_terminated_length": 1380.0, + "completions/mean_length": 487.0357360839844, + "completions/mean_terminated_length": 487.0357360839844, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 2.163012638638122, + "grad_norm": 0.6591194868087769, + "kl": 0.1126708984375, + "learning_rate": 1e-06, + "loss": 0.0117, + "num_tokens": 245483337.0, + "reward": 1.5866073369979858, + "reward_std": 0.13034969568252563, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5866071581840515, + "rewards/curriculum_aware_reward_fn/std": 0.44255515933036804, + "step": 2096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1023.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 494.1964416503906, + "completions/mean_terminated_length": 494.1964416503906, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 2.164044364199123, + "grad_norm": 0.6993364691734314, + "kl": 0.12255859375, + "learning_rate": 1e-06, + "loss": 0.0155, + "num_tokens": 245601939.0, + "reward": 1.4361608028411865, + "reward_std": 0.23765063285827637, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43616071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.41817978024482727, + "step": 2097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 957.0, + "completions/max_terminated_length": 957.0, + "completions/mean_length": 512.8660888671875, + "completions/mean_terminated_length": 512.8660888671875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 2.165076089760124, + "grad_norm": 0.7323153614997864, + "kl": 0.1158447265625, + "learning_rate": 1e-06, + "loss": -0.0304, + "num_tokens": 245740580.0, + "reward": 1.3687500953674316, + "reward_std": 0.22040800750255585, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3687500059604645, + "rewards/curriculum_aware_reward_fn/std": 0.40808966755867004, + "step": 2098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1176.0, + "completions/max_terminated_length": 1176.0, + "completions/mean_length": 474.8839416503906, + "completions/mean_terminated_length": 474.8839416503906, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 2.1661078153211246, + "grad_norm": 0.73459392786026, + "kl": 0.1187744140625, + "learning_rate": 1e-06, + "loss": 0.019, + "num_tokens": 245861079.0, + "reward": 1.4508930444717407, + "reward_std": 0.17947648465633392, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4508928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.4086333215236664, + "step": 2099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1600.0, + "completions/max_terminated_length": 1600.0, + "completions/mean_length": 486.65179443359375, + "completions/mean_terminated_length": 486.65179443359375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 2.1671395408821255, + "grad_norm": 0.7303931713104248, + "kl": 0.1201171875, + "learning_rate": 1e-06, + "loss": -0.0039, + "num_tokens": 245970877.0, + "reward": 1.427232265472412, + "reward_std": 0.23644983768463135, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42723211646080017, + "rewards/curriculum_aware_reward_fn/std": 0.40630659461021423, + "step": 2100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 978.0, + "completions/max_terminated_length": 978.0, + "completions/mean_length": 442.6607360839844, + "completions/mean_terminated_length": 442.6607360839844, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 2.1681712664431263, + "grad_norm": 0.7605082988739014, + "kl": 0.1337890625, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 246084603.0, + "reward": 1.3745537996292114, + "reward_std": 0.16059215366840363, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37455353140830994, + "rewards/curriculum_aware_reward_fn/std": 0.37102216482162476, + "step": 2101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3405.0, + "completions/max_terminated_length": 3405.0, + "completions/mean_length": 535.0714721679688, + "completions/mean_terminated_length": 535.0714721679688, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 2.1692029920041267, + "grad_norm": 0.6256927251815796, + "kl": 0.111328125, + "learning_rate": 1e-06, + "loss": 0.0116, + "num_tokens": 246209169.0, + "reward": 1.4223215579986572, + "reward_std": 0.18943563103675842, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42232146859169006, + "rewards/curriculum_aware_reward_fn/std": 0.4398934841156006, + "step": 2102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 928.0, + "completions/max_terminated_length": 928.0, + "completions/mean_length": 449.169677734375, + "completions/mean_terminated_length": 449.169677734375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 2.1702347175651275, + "grad_norm": 0.7536016702651978, + "kl": 0.1292724609375, + "learning_rate": 1e-06, + "loss": 0.0114, + "num_tokens": 246327936.0, + "reward": 1.5486608743667603, + "reward_std": 0.2521655261516571, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5486606955528259, + "rewards/curriculum_aware_reward_fn/std": 0.4142974019050598, + "step": 2103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1337.0, + "completions/max_terminated_length": 1337.0, + "completions/mean_length": 524.0892944335938, + "completions/mean_terminated_length": 524.0892944335938, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 2.1712664431261284, + "grad_norm": 0.76837557554245, + "kl": 0.1265869140625, + "learning_rate": 1e-06, + "loss": 0.0153, + "num_tokens": 246455710.0, + "reward": 1.3549107313156128, + "reward_std": 0.24753983318805695, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3549107015132904, + "rewards/curriculum_aware_reward_fn/std": 0.40341779589653015, + "step": 2104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1051.0, + "completions/max_terminated_length": 1051.0, + "completions/mean_length": 430.2232360839844, + "completions/mean_terminated_length": 430.2232360839844, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 2.172298168687129, + "grad_norm": 0.8480135202407837, + "kl": 0.1304931640625, + "learning_rate": 1e-06, + "loss": 0.0147, + "num_tokens": 246576317.0, + "reward": 1.5651787519454956, + "reward_std": 0.22769606113433838, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5651785731315613, + "rewards/curriculum_aware_reward_fn/std": 0.40829458832740784, + "step": 2105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 984.0, + "completions/max_terminated_length": 984.0, + "completions/mean_length": 496.15179443359375, + "completions/mean_terminated_length": 496.15179443359375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 2.17332989424813, + "grad_norm": 0.65647953748703, + "kl": 0.126220703125, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 246704833.0, + "reward": 1.4482142925262451, + "reward_std": 0.19437025487422943, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4571428596973419, + "rewards/curriculum_aware_reward_fn/std": 0.41600075364112854, + "step": 2106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2709.0, + "completions/max_terminated_length": 2709.0, + "completions/mean_length": 457.607177734375, + "completions/mean_terminated_length": 457.607177734375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 2.174361619809131, + "grad_norm": 0.7182737588882446, + "kl": 0.1285400390625, + "learning_rate": 1e-06, + "loss": 0.0334, + "num_tokens": 246821801.0, + "reward": 1.4437501430511475, + "reward_std": 0.21361099183559418, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4437499940395355, + "rewards/curriculum_aware_reward_fn/std": 0.42786526679992676, + "step": 2107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 990.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 517.0803833007812, + "completions/mean_terminated_length": 517.0803833007812, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 2.1753933453701317, + "grad_norm": 0.7584192752838135, + "kl": 0.11572265625, + "learning_rate": 1e-06, + "loss": 0.0328, + "num_tokens": 246951289.0, + "reward": 1.4312500953674316, + "reward_std": 0.2715103030204773, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.4491071403026581, + "rewards/curriculum_aware_reward_fn/std": 0.40879857540130615, + "step": 2108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1060.0, + "completions/max_terminated_length": 1060.0, + "completions/mean_length": 518.375, + "completions/mean_terminated_length": 518.375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 2.176425070931132, + "grad_norm": 0.6757792830467224, + "kl": 0.107421875, + "learning_rate": 1e-06, + "loss": -0.0085, + "num_tokens": 247076533.0, + "reward": 1.4361608028411865, + "reward_std": 0.17304961383342743, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43616074323654175, + "rewards/curriculum_aware_reward_fn/std": 0.43429604172706604, + "step": 2109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 891.0, + "completions/max_terminated_length": 891.0, + "completions/mean_length": 433.7321472167969, + "completions/mean_terminated_length": 433.7321472167969, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 2.177456796492133, + "grad_norm": 0.6956561803817749, + "kl": 0.1221923828125, + "learning_rate": 1e-06, + "loss": 0.0024, + "num_tokens": 247189917.0, + "reward": 1.614732265472412, + "reward_std": 0.12834465503692627, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6147321462631226, + "rewards/curriculum_aware_reward_fn/std": 0.42270249128341675, + "step": 2110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 940.0, + "completions/max_terminated_length": 940.0, + "completions/mean_length": 495.4910888671875, + "completions/mean_terminated_length": 495.4910888671875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 2.178488522053134, + "grad_norm": 0.8153955340385437, + "kl": 0.1334228515625, + "learning_rate": 1e-06, + "loss": -0.0234, + "num_tokens": 247312822.0, + "reward": 1.3299108743667603, + "reward_std": 0.18136192858219147, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.33883926272392273, + "rewards/curriculum_aware_reward_fn/std": 0.386932373046875, + "step": 2111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1063.0, + "completions/max_terminated_length": 1063.0, + "completions/mean_length": 502.8839416503906, + "completions/mean_terminated_length": 502.8839416503906, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 2.1795202476141347, + "grad_norm": 0.7842468023300171, + "kl": 0.112548828125, + "learning_rate": 1e-06, + "loss": -0.0163, + "num_tokens": 247432453.0, + "reward": 1.3258930444717407, + "reward_std": 0.27183881402015686, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3258928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3902518153190613, + "step": 2112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1342.0, + "completions/mean_length": 565.5625, + "completions/mean_terminated_length": 501.3727111816406, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 2.1805519731751355, + "grad_norm": 0.6904152631759644, + "kl": 0.1268310546875, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 247576023.0, + "reward": 1.5044645071029663, + "reward_std": 0.1780475676059723, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5044642686843872, + "rewards/curriculum_aware_reward_fn/std": 0.5449420809745789, + "step": 2113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1255.0, + "completions/max_terminated_length": 1255.0, + "completions/mean_length": 507.0357360839844, + "completions/mean_terminated_length": 507.0357360839844, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 2.1815836987361363, + "grad_norm": 0.7472612261772156, + "kl": 0.1280517578125, + "learning_rate": 1e-06, + "loss": 0.0121, + "num_tokens": 247695181.0, + "reward": 1.5468751192092896, + "reward_std": 0.16403816640377045, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.546875, + "rewards/curriculum_aware_reward_fn/std": 0.4169430434703827, + "step": 2114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1153.0, + "completions/max_terminated_length": 1153.0, + "completions/mean_length": 506.9375305175781, + "completions/mean_terminated_length": 506.9375305175781, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 2.182615424297137, + "grad_norm": 0.7527168393135071, + "kl": 0.1275634765625, + "learning_rate": 1e-06, + "loss": 0.0178, + "num_tokens": 247822854.0, + "reward": 1.4482142925262451, + "reward_std": 0.13642320036888123, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4482142925262451, + "rewards/curriculum_aware_reward_fn/std": 0.4124705195426941, + "step": 2115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1068.0, + "completions/max_terminated_length": 1068.0, + "completions/mean_length": 501.4910888671875, + "completions/mean_terminated_length": 501.4910888671875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 2.1836471498581376, + "grad_norm": 0.6995075345039368, + "kl": 0.1165771484375, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 247948531.0, + "reward": 1.3142857551574707, + "reward_std": 0.1445000171661377, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3142857253551483, + "rewards/curriculum_aware_reward_fn/std": 0.415164589881897, + "step": 2116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1058.0, + "completions/max_terminated_length": 1058.0, + "completions/mean_length": 448.2589416503906, + "completions/mean_terminated_length": 448.2589416503906, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 2.1846788754191384, + "grad_norm": 0.62738037109375, + "kl": 0.12548828125, + "learning_rate": 1e-06, + "loss": 0.0106, + "num_tokens": 248052279.0, + "reward": 1.5205358266830444, + "reward_std": 0.10798604786396027, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5205356478691101, + "rewards/curriculum_aware_reward_fn/std": 0.43072080612182617, + "step": 2117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2031.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 520.3482666015625, + "completions/mean_terminated_length": 520.3482666015625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 2.1857106009801393, + "grad_norm": 0.773405134677887, + "kl": 0.13330078125, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 248179977.0, + "reward": 1.5383931398391724, + "reward_std": 0.22789046168327332, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5473214387893677, + "rewards/curriculum_aware_reward_fn/std": 0.4031318426132202, + "step": 2118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1355.0, + "completions/max_terminated_length": 1355.0, + "completions/mean_length": 497.71429443359375, + "completions/mean_terminated_length": 497.71429443359375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 2.18674232654114, + "grad_norm": 0.7330118417739868, + "kl": 0.1221923828125, + "learning_rate": 1e-06, + "loss": 0.0223, + "num_tokens": 248307100.0, + "reward": 1.3544644117355347, + "reward_std": 0.10890157520771027, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3544642925262451, + "rewards/curriculum_aware_reward_fn/std": 0.4141937494277954, + "step": 2119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1394.0, + "completions/max_terminated_length": 1394.0, + "completions/mean_length": 497.3214416503906, + "completions/mean_terminated_length": 497.3214416503906, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 2.187774052102141, + "grad_norm": 0.6636401414871216, + "kl": 0.126953125, + "learning_rate": 1e-06, + "loss": -0.0144, + "num_tokens": 248424834.0, + "reward": 1.5687501430511475, + "reward_std": 0.2093847543001175, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5687499642372131, + "rewards/curriculum_aware_reward_fn/std": 0.44509968161582947, + "step": 2120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1626.0, + "completions/max_terminated_length": 1626.0, + "completions/mean_length": 479.5714416503906, + "completions/mean_terminated_length": 479.5714416503906, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 2.1888057776631418, + "grad_norm": 0.7832713723182678, + "kl": 0.1363525390625, + "learning_rate": 1e-06, + "loss": 0.0214, + "num_tokens": 248551885.0, + "reward": 1.5040180683135986, + "reward_std": 0.21539589762687683, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5218750238418579, + "rewards/curriculum_aware_reward_fn/std": 0.4540002942085266, + "step": 2121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1432.0, + "completions/mean_length": 505.40179443359375, + "completions/mean_terminated_length": 473.0540771484375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 2.189837503224142, + "grad_norm": 0.7848573327064514, + "kl": 0.12451171875, + "learning_rate": 1e-06, + "loss": 0.0235, + "num_tokens": 248674360.0, + "reward": 1.579464316368103, + "reward_std": 0.24824805557727814, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5794642567634583, + "rewards/curriculum_aware_reward_fn/std": 0.4694041609764099, + "step": 2122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1162.0, + "completions/max_terminated_length": 1162.0, + "completions/mean_length": 471.5000305175781, + "completions/mean_terminated_length": 471.5000305175781, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 2.190869228785143, + "grad_norm": 0.8156066536903381, + "kl": 0.1309814453125, + "learning_rate": 1e-06, + "loss": 0.0317, + "num_tokens": 248793339.0, + "reward": 1.4714287519454956, + "reward_std": 0.2152327597141266, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4714285731315613, + "rewards/curriculum_aware_reward_fn/std": 0.4535999298095703, + "step": 2123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1339.0, + "completions/max_terminated_length": 1339.0, + "completions/mean_length": 524.6607666015625, + "completions/mean_terminated_length": 524.6607666015625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 2.191900954346144, + "grad_norm": 0.7268944382667542, + "kl": 0.1322021484375, + "learning_rate": 1e-06, + "loss": 0.0159, + "num_tokens": 248913505.0, + "reward": 1.3598215579986572, + "reward_std": 0.18459539115428925, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35982146859169006, + "rewards/curriculum_aware_reward_fn/std": 0.3496999740600586, + "step": 2124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1046.0, + "completions/max_terminated_length": 1046.0, + "completions/mean_length": 494.2589416503906, + "completions/mean_terminated_length": 494.2589416503906, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 2.1929326799071447, + "grad_norm": 0.7613234519958496, + "kl": 0.12744140625, + "learning_rate": 1e-06, + "loss": 0.0322, + "num_tokens": 249031162.0, + "reward": 1.587053656578064, + "reward_std": 0.18938103318214417, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5870535969734192, + "rewards/curriculum_aware_reward_fn/std": 0.4418809711933136, + "step": 2125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 953.0, + "completions/max_terminated_length": 953.0, + "completions/mean_length": 468.794677734375, + "completions/mean_terminated_length": 468.794677734375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 2.1939644054681455, + "grad_norm": 0.5832720994949341, + "kl": 0.1263427734375, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 249153464.0, + "reward": 1.4330357313156128, + "reward_std": 0.16529995203018188, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4330357015132904, + "rewards/curriculum_aware_reward_fn/std": 0.4615657329559326, + "step": 2126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1110.0, + "completions/max_terminated_length": 1110.0, + "completions/mean_length": 459.1875305175781, + "completions/mean_terminated_length": 459.1875305175781, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 2.1949961310291464, + "grad_norm": 0.8431884050369263, + "kl": 0.1591796875, + "learning_rate": 1e-06, + "loss": -0.0056, + "num_tokens": 249269776.0, + "reward": 1.4276785850524902, + "reward_std": 0.23011423647403717, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.43660715222358704, + "rewards/curriculum_aware_reward_fn/std": 0.3886904716491699, + "step": 2127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1473.0, + "completions/max_terminated_length": 1473.0, + "completions/mean_length": 477.544677734375, + "completions/mean_terminated_length": 477.544677734375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 2.196027856590147, + "grad_norm": 0.7806416153907776, + "kl": 0.131591796875, + "learning_rate": 1e-06, + "loss": 0.0208, + "num_tokens": 249375516.0, + "reward": 1.5459822416305542, + "reward_std": 0.24724382162094116, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5459821820259094, + "rewards/curriculum_aware_reward_fn/std": 0.4110598564147949, + "step": 2128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1181.0, + "completions/max_terminated_length": 1181.0, + "completions/mean_length": 492.0714416503906, + "completions/mean_terminated_length": 492.0714416503906, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 2.1970595821511476, + "grad_norm": 0.6576789617538452, + "kl": 0.1273193359375, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 249502882.0, + "reward": 1.3633930683135986, + "reward_std": 0.20141588151454926, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3633928596973419, + "rewards/curriculum_aware_reward_fn/std": 0.39215174317359924, + "step": 2129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2546.0, + "completions/max_terminated_length": 2546.0, + "completions/mean_length": 438.45538330078125, + "completions/mean_terminated_length": 438.45538330078125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 2.1980913077121484, + "grad_norm": 0.7464306950569153, + "kl": 0.135009765625, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 249616298.0, + "reward": 1.6316964626312256, + "reward_std": 0.1531204730272293, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6316964030265808, + "rewards/curriculum_aware_reward_fn/std": 0.3808475434780121, + "step": 2130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1904.0, + "completions/max_terminated_length": 1904.0, + "completions/mean_length": 499.982177734375, + "completions/mean_terminated_length": 499.982177734375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 2.1991230332731493, + "grad_norm": 0.6995850801467896, + "kl": 0.12451171875, + "learning_rate": 1e-06, + "loss": -0.0219, + "num_tokens": 249740315.0, + "reward": 1.4102680683135986, + "reward_std": 0.11930114775896072, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.41919639706611633, + "rewards/curriculum_aware_reward_fn/std": 0.3942860960960388, + "step": 2131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1609.0, + "completions/max_terminated_length": 1609.0, + "completions/mean_length": 517.2232666015625, + "completions/mean_terminated_length": 517.2232666015625, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 2.20015475883415, + "grad_norm": 0.8047731518745422, + "kl": 0.1268310546875, + "learning_rate": 1e-06, + "loss": 0.0307, + "num_tokens": 249867908.0, + "reward": 1.4915179014205933, + "reward_std": 0.198326975107193, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4915178418159485, + "rewards/curriculum_aware_reward_fn/std": 0.4356773793697357, + "step": 2132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1301.0, + "completions/max_terminated_length": 1301.0, + "completions/mean_length": 463.669677734375, + "completions/mean_terminated_length": 463.669677734375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 2.201186484395151, + "grad_norm": 0.6989313364028931, + "kl": 0.1282958984375, + "learning_rate": 1e-06, + "loss": 0.0222, + "num_tokens": 249983266.0, + "reward": 1.5669643878936768, + "reward_std": 0.19032026827335358, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5669642686843872, + "rewards/curriculum_aware_reward_fn/std": 0.42846643924713135, + "step": 2133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1163.0, + "completions/max_terminated_length": 1163.0, + "completions/mean_length": 484.51788330078125, + "completions/mean_terminated_length": 484.51788330078125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 2.202218209956152, + "grad_norm": 0.843909740447998, + "kl": 0.128662109375, + "learning_rate": 1e-06, + "loss": 0.0428, + "num_tokens": 250114959.0, + "reward": 1.503571629524231, + "reward_std": 0.24994143843650818, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5035714507102966, + "rewards/curriculum_aware_reward_fn/std": 0.39322608709335327, + "step": 2134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 967.0, + "completions/max_terminated_length": 967.0, + "completions/mean_length": 428.2946472167969, + "completions/mean_terminated_length": 428.2946472167969, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 2.203249935517152, + "grad_norm": 0.7581227421760559, + "kl": 0.1234130859375, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 250227965.0, + "reward": 1.506250023841858, + "reward_std": 0.16399899125099182, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5062500238418579, + "rewards/curriculum_aware_reward_fn/std": 0.42288821935653687, + "step": 2135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2759.0, + "completions/max_terminated_length": 2759.0, + "completions/mean_length": 402.4285888671875, + "completions/mean_terminated_length": 402.4285888671875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 2.204281661078153, + "grad_norm": 0.8055557608604431, + "kl": 0.1396484375, + "learning_rate": 1e-06, + "loss": -0.0158, + "num_tokens": 250329611.0, + "reward": 1.6191965341567993, + "reward_std": 0.16283194720745087, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6191964149475098, + "rewards/curriculum_aware_reward_fn/std": 0.4231076240539551, + "step": 2136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1142.0, + "completions/max_terminated_length": 1142.0, + "completions/mean_length": 421.26788330078125, + "completions/mean_terminated_length": 421.26788330078125, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 2.205313386639154, + "grad_norm": 0.7206082344055176, + "kl": 0.1309814453125, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 250444441.0, + "reward": 1.4982143640518188, + "reward_std": 0.19586418569087982, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4982143044471741, + "rewards/curriculum_aware_reward_fn/std": 0.4536105692386627, + "step": 2137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1214.0, + "completions/max_terminated_length": 1214.0, + "completions/mean_length": 429.1339416503906, + "completions/mean_terminated_length": 429.1339416503906, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 2.2063451122001547, + "grad_norm": 0.782223105430603, + "kl": 0.133056640625, + "learning_rate": 1e-06, + "loss": 0.0241, + "num_tokens": 250559550.0, + "reward": 1.3928571939468384, + "reward_std": 0.1982770413160324, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3928571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.42244765162467957, + "step": 2138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1527.0, + "completions/max_terminated_length": 1527.0, + "completions/mean_length": 505.8482360839844, + "completions/mean_terminated_length": 505.8482360839844, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 2.2073768377611556, + "grad_norm": 0.6624898314476013, + "kl": 0.117919921875, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 250687731.0, + "reward": 1.3897321224212646, + "reward_std": 0.1644437313079834, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38973215222358704, + "rewards/curriculum_aware_reward_fn/std": 0.35349851846694946, + "step": 2139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1354.0, + "completions/max_terminated_length": 1354.0, + "completions/mean_length": 433.89288330078125, + "completions/mean_terminated_length": 433.89288330078125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 2.2084085633221564, + "grad_norm": 0.7602798342704773, + "kl": 0.14306640625, + "learning_rate": 1e-06, + "loss": 0.0429, + "num_tokens": 250807678.0, + "reward": 1.5544644594192505, + "reward_std": 0.20387950539588928, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5633928775787354, + "rewards/curriculum_aware_reward_fn/std": 0.3997173607349396, + "step": 2140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1214.0, + "completions/mean_length": 518.857177734375, + "completions/mean_terminated_length": 486.6306457519531, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 2.2094402888831572, + "grad_norm": 0.7733542323112488, + "kl": 0.1219482421875, + "learning_rate": 1e-06, + "loss": 0.0383, + "num_tokens": 250937253.0, + "reward": 1.4982143640518188, + "reward_std": 0.21743346750736237, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4982143044471741, + "rewards/curriculum_aware_reward_fn/std": 0.3919762969017029, + "step": 2141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1021.0, + "completions/max_terminated_length": 1021.0, + "completions/mean_length": 446.15179443359375, + "completions/mean_terminated_length": 446.15179443359375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 2.210472014444158, + "grad_norm": 0.6671974658966064, + "kl": 0.11962890625, + "learning_rate": 1e-06, + "loss": 0.0186, + "num_tokens": 251060862.0, + "reward": 1.5214287042617798, + "reward_std": 0.14280013740062714, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5214285850524902, + "rewards/curriculum_aware_reward_fn/std": 0.44472455978393555, + "step": 2142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 456.5535888671875, + "completions/mean_terminated_length": 456.5535888671875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 2.2115037400051585, + "grad_norm": 0.775130033493042, + "kl": 0.1229248046875, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 251177817.0, + "reward": 1.2549108266830444, + "reward_std": 0.15199097990989685, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.25491073727607727, + "rewards/curriculum_aware_reward_fn/std": 0.31192225217819214, + "step": 2143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 902.0, + "completions/max_terminated_length": 902.0, + "completions/mean_length": 431.52679443359375, + "completions/mean_terminated_length": 431.52679443359375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 2.2125354655661593, + "grad_norm": 0.7285803556442261, + "kl": 0.12939453125, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 251297632.0, + "reward": 1.3763394355773926, + "reward_std": 0.15684416890144348, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37633928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.4005041718482971, + "step": 2144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1134.0, + "completions/max_terminated_length": 1134.0, + "completions/mean_length": 439.3482360839844, + "completions/mean_terminated_length": 439.3482360839844, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 2.21356719112716, + "grad_norm": 0.7767670750617981, + "kl": 0.111328125, + "learning_rate": 1e-06, + "loss": -0.0239, + "num_tokens": 251408180.0, + "reward": 1.6044644117355347, + "reward_std": 0.17807038128376007, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6044642329216003, + "rewards/curriculum_aware_reward_fn/std": 0.4751269817352295, + "step": 2145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1101.0, + "completions/max_terminated_length": 1101.0, + "completions/mean_length": 449.3214416503906, + "completions/mean_terminated_length": 449.3214416503906, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 2.214598916688161, + "grad_norm": 0.5723203420639038, + "kl": 0.107421875, + "learning_rate": 1e-06, + "loss": -0.0028, + "num_tokens": 251520461.0, + "reward": 1.5924108028411865, + "reward_std": 0.1818428784608841, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.592410683631897, + "rewards/curriculum_aware_reward_fn/std": 0.45292890071868896, + "step": 2146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 425.2857360839844, + "completions/mean_terminated_length": 425.2857360839844, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 2.215630642249162, + "grad_norm": 0.7234082221984863, + "kl": 0.111083984375, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 251623372.0, + "reward": 1.5941965579986572, + "reward_std": 0.2118162214756012, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5941964387893677, + "rewards/curriculum_aware_reward_fn/std": 0.4257872998714447, + "step": 2147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1098.0, + "completions/max_terminated_length": 1098.0, + "completions/mean_length": 459.5357360839844, + "completions/mean_terminated_length": 459.5357360839844, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 2.2166623678101627, + "grad_norm": 0.8283867835998535, + "kl": 0.119384765625, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 251744980.0, + "reward": 1.4450894594192505, + "reward_std": 0.19101282954216003, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44508928060531616, + "rewards/curriculum_aware_reward_fn/std": 0.40614426136016846, + "step": 2148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 435.70538330078125, + "completions/mean_terminated_length": 435.70538330078125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 2.217694093371163, + "grad_norm": 0.7950035929679871, + "kl": 0.13330078125, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 251868350.0, + "reward": 1.3888393640518188, + "reward_std": 0.1650308221578598, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3977678716182709, + "rewards/curriculum_aware_reward_fn/std": 0.4264027178287506, + "step": 2149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1430.0, + "completions/max_terminated_length": 1430.0, + "completions/mean_length": 428.0357360839844, + "completions/mean_terminated_length": 428.0357360839844, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 2.218725818932164, + "grad_norm": 0.8781709671020508, + "kl": 0.1246337890625, + "learning_rate": 1e-06, + "loss": -0.0176, + "num_tokens": 251989559.0, + "reward": 1.3375002145767212, + "reward_std": 0.1628822684288025, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33750003576278687, + "rewards/curriculum_aware_reward_fn/std": 0.36052390933036804, + "step": 2150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1137.0, + "completions/max_terminated_length": 1137.0, + "completions/mean_length": 433.7321472167969, + "completions/mean_terminated_length": 433.7321472167969, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 2.2197575444931648, + "grad_norm": 0.6704356074333191, + "kl": 0.1156005859375, + "learning_rate": 1e-06, + "loss": 0.0339, + "num_tokens": 252100024.0, + "reward": 1.4651787281036377, + "reward_std": 0.16542240977287292, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46517857909202576, + "rewards/curriculum_aware_reward_fn/std": 0.46128684282302856, + "step": 2151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1097.0, + "completions/max_terminated_length": 1097.0, + "completions/mean_length": 434.4107360839844, + "completions/mean_terminated_length": 434.4107360839844, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 2.2207892700541656, + "grad_norm": 0.7110837697982788, + "kl": 0.124267578125, + "learning_rate": 1e-06, + "loss": 0.01, + "num_tokens": 252218531.0, + "reward": 1.5674108266830444, + "reward_std": 0.20352497696876526, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5763392448425293, + "rewards/curriculum_aware_reward_fn/std": 0.5065765380859375, + "step": 2152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1754.0, + "completions/max_terminated_length": 1754.0, + "completions/mean_length": 461.52679443359375, + "completions/mean_terminated_length": 461.52679443359375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 2.2218209956151664, + "grad_norm": 0.7591835856437683, + "kl": 0.1285400390625, + "learning_rate": 1e-06, + "loss": -0.0204, + "num_tokens": 252339312.0, + "reward": 1.4964287281036377, + "reward_std": 0.20607492327690125, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.49642854928970337, + "rewards/curriculum_aware_reward_fn/std": 0.42620861530303955, + "step": 2153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 861.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 406.9375305175781, + "completions/mean_terminated_length": 406.9375305175781, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 2.2228527211761673, + "grad_norm": 0.8121379017829895, + "kl": 0.1046142578125, + "learning_rate": 1e-06, + "loss": 0.0271, + "num_tokens": 252447497.0, + "reward": 1.6383929252624512, + "reward_std": 0.19473080337047577, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6383928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.38291096687316895, + "step": 2154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 999.0, + "completions/max_terminated_length": 999.0, + "completions/mean_length": 403.9375305175781, + "completions/mean_terminated_length": 403.9375305175781, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 2.223884446737168, + "grad_norm": 0.7377510666847229, + "kl": 0.11181640625, + "learning_rate": 1e-06, + "loss": 0.0388, + "num_tokens": 252547779.0, + "reward": 1.697767972946167, + "reward_std": 0.22175738215446472, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6977678537368774, + "rewards/curriculum_aware_reward_fn/std": 0.4093715250492096, + "step": 2155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 405.4821472167969, + "completions/mean_terminated_length": 405.4821472167969, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 2.2249161722981685, + "grad_norm": 0.7825002074241638, + "kl": 0.116455078125, + "learning_rate": 1e-06, + "loss": 0.0099, + "num_tokens": 252653400.0, + "reward": 1.5647321939468384, + "reward_std": 0.14512257277965546, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5647321343421936, + "rewards/curriculum_aware_reward_fn/std": 0.44263941049575806, + "step": 2156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 983.0, + "completions/max_terminated_length": 983.0, + "completions/mean_length": 481.6875305175781, + "completions/mean_terminated_length": 481.6875305175781, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 2.2259478978591694, + "grad_norm": 0.7722179293632507, + "kl": 0.1181640625, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 252777111.0, + "reward": 1.4276787042617798, + "reward_std": 0.19991116225719452, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42767855525016785, + "rewards/curriculum_aware_reward_fn/std": 0.4300030767917633, + "step": 2157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 407.8660888671875, + "completions/mean_terminated_length": 407.8660888671875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 2.22697962342017, + "grad_norm": 0.6580060720443726, + "kl": 0.1263427734375, + "learning_rate": 1e-06, + "loss": 0.0075, + "num_tokens": 252883650.0, + "reward": 1.6477681398391724, + "reward_std": 0.15886934101581573, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6477679014205933, + "rewards/curriculum_aware_reward_fn/std": 0.4077177047729492, + "step": 2158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1382.0, + "completions/max_terminated_length": 1382.0, + "completions/mean_length": 431.46429443359375, + "completions/mean_terminated_length": 431.46429443359375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 2.228011348981171, + "grad_norm": 0.7175585627555847, + "kl": 0.1387939453125, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 253004984.0, + "reward": 1.4406250715255737, + "reward_std": 0.21226482093334198, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.44955354928970337, + "rewards/curriculum_aware_reward_fn/std": 0.4401318430900574, + "step": 2159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 439.4107360839844, + "completions/mean_terminated_length": 439.4107360839844, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 2.229043074542172, + "grad_norm": 0.6940580010414124, + "kl": 0.114501953125, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 253120782.0, + "reward": 1.5446430444717407, + "reward_std": 0.16438443958759308, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5446428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.44084057211875916, + "step": 2160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 420.9375305175781, + "completions/mean_terminated_length": 420.9375305175781, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 2.2300748001031727, + "grad_norm": 0.9348301291465759, + "kl": 0.1226806640625, + "learning_rate": 1e-06, + "loss": -0.0145, + "num_tokens": 253236324.0, + "reward": 1.5491071939468384, + "reward_std": 0.2583702802658081, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5491071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.4151403605937958, + "step": 2161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1091.0, + "completions/max_terminated_length": 1091.0, + "completions/mean_length": 459.357177734375, + "completions/mean_terminated_length": 459.357177734375, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 2.231106525664173, + "grad_norm": 0.7177847623825073, + "kl": 0.11181640625, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 253359088.0, + "reward": 1.4714287519454956, + "reward_std": 0.16225051879882812, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4714285731315613, + "rewards/curriculum_aware_reward_fn/std": 0.4265255630016327, + "step": 2162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1367.0, + "completions/max_terminated_length": 1367.0, + "completions/mean_length": 503.107177734375, + "completions/mean_terminated_length": 503.107177734375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 2.232138251225174, + "grad_norm": 0.6663182973861694, + "kl": 0.1199951171875, + "learning_rate": 1e-06, + "loss": -0.0106, + "num_tokens": 253485198.0, + "reward": 1.4745537042617798, + "reward_std": 0.14503777027130127, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47455355525016785, + "rewards/curriculum_aware_reward_fn/std": 0.4369218349456787, + "step": 2163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1002.0, + "completions/max_terminated_length": 1002.0, + "completions/mean_length": 443.3214416503906, + "completions/mean_terminated_length": 443.3214416503906, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 2.233169976786175, + "grad_norm": 0.6235113739967346, + "kl": 0.1243896484375, + "learning_rate": 1e-06, + "loss": -0.0142, + "num_tokens": 253607300.0, + "reward": 1.5187500715255737, + "reward_std": 0.14643462002277374, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.518750011920929, + "rewards/curriculum_aware_reward_fn/std": 0.43903684616088867, + "step": 2164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1213.0, + "completions/max_terminated_length": 1213.0, + "completions/mean_length": 445.9107360839844, + "completions/mean_terminated_length": 445.9107360839844, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 2.2342017023471756, + "grad_norm": 0.8566228151321411, + "kl": 0.124267578125, + "learning_rate": 1e-06, + "loss": 0.0479, + "num_tokens": 253725534.0, + "reward": 1.519196629524231, + "reward_std": 0.2857770025730133, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5191964507102966, + "rewards/curriculum_aware_reward_fn/std": 0.4281347095966339, + "step": 2165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1362.0, + "completions/max_terminated_length": 1362.0, + "completions/mean_length": 487.0625305175781, + "completions/mean_terminated_length": 487.0625305175781, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 2.2352334279081765, + "grad_norm": 0.623132050037384, + "kl": 0.1077880859375, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 253851527.0, + "reward": 1.3781250715255737, + "reward_std": 0.15566569566726685, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.38705354928970337, + "rewards/curriculum_aware_reward_fn/std": 0.4219082295894623, + "step": 2166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1244.0, + "completions/max_terminated_length": 1244.0, + "completions/mean_length": 507.3214416503906, + "completions/mean_terminated_length": 507.3214416503906, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 2.2362651534691773, + "grad_norm": 0.7437611222267151, + "kl": 0.121826171875, + "learning_rate": 1e-06, + "loss": -0.0311, + "num_tokens": 253976660.0, + "reward": 1.4370537996292114, + "reward_std": 0.24500703811645508, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43705353140830994, + "rewards/curriculum_aware_reward_fn/std": 0.40977612137794495, + "step": 2167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1385.0, + "completions/max_terminated_length": 1385.0, + "completions/mean_length": 424.27679443359375, + "completions/mean_terminated_length": 424.27679443359375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 2.237296879030178, + "grad_norm": 0.7895928025245667, + "kl": 0.129150390625, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 254094991.0, + "reward": 1.469642996788025, + "reward_std": 0.14035160839557648, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46964284777641296, + "rewards/curriculum_aware_reward_fn/std": 0.37622812390327454, + "step": 2168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 397.4285888671875, + "completions/mean_terminated_length": 397.4285888671875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 2.2383286045911786, + "grad_norm": 0.8234912753105164, + "kl": 0.1220703125, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 254208824.0, + "reward": 1.5513393878936768, + "reward_std": 0.23913827538490295, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5513392686843872, + "rewards/curriculum_aware_reward_fn/std": 0.45665547251701355, + "step": 2169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1095.0, + "completions/max_terminated_length": 1095.0, + "completions/mean_length": 444.4375305175781, + "completions/mean_terminated_length": 444.4375305175781, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 2.2393603301521794, + "grad_norm": 0.7604856491088867, + "kl": 0.110107421875, + "learning_rate": 1e-06, + "loss": 0.0189, + "num_tokens": 254331324.0, + "reward": 1.5223214626312256, + "reward_std": 0.24196383357048035, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5223214030265808, + "rewards/curriculum_aware_reward_fn/std": 0.44816136360168457, + "step": 2170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1074.0, + "completions/max_terminated_length": 1074.0, + "completions/mean_length": 463.8482360839844, + "completions/mean_terminated_length": 463.8482360839844, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 2.2403920557131802, + "grad_norm": 0.7959080934524536, + "kl": 0.127197265625, + "learning_rate": 1e-06, + "loss": 0.0204, + "num_tokens": 254455319.0, + "reward": 1.4455358982086182, + "reward_std": 0.1944299191236496, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44553568959236145, + "rewards/curriculum_aware_reward_fn/std": 0.4174436330795288, + "step": 2171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1652.0, + "completions/max_terminated_length": 1652.0, + "completions/mean_length": 456.857177734375, + "completions/mean_terminated_length": 456.857177734375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 2.241423781274181, + "grad_norm": 0.7582790851593018, + "kl": 0.1181640625, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 254581583.0, + "reward": 1.4687501192092896, + "reward_std": 0.2497893124818802, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46875, + "rewards/curriculum_aware_reward_fn/std": 0.43184730410575867, + "step": 2172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1748.0, + "completions/max_terminated_length": 1748.0, + "completions/mean_length": 523.8482666015625, + "completions/mean_terminated_length": 523.8482666015625, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 2.242455506835182, + "grad_norm": 0.6578783988952637, + "kl": 0.112548828125, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 254707776.0, + "reward": 1.4723217487335205, + "reward_std": 0.20645909011363983, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.48125001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.4099617898464203, + "step": 2173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1240.0, + "completions/max_terminated_length": 1240.0, + "completions/mean_length": 471.40179443359375, + "completions/mean_terminated_length": 471.40179443359375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 2.2434872323961828, + "grad_norm": 0.6668171286582947, + "kl": 0.1129150390625, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 254822362.0, + "reward": 1.4473215341567993, + "reward_std": 0.12001083791255951, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44732141494750977, + "rewards/curriculum_aware_reward_fn/std": 0.4498167335987091, + "step": 2174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 433.3482360839844, + "completions/mean_terminated_length": 433.3482360839844, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 2.2445189579571836, + "grad_norm": 0.7935535907745361, + "kl": 0.130615234375, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 254932119.0, + "reward": 1.4691966772079468, + "reward_std": 0.15878739953041077, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4691964089870453, + "rewards/curriculum_aware_reward_fn/std": 0.4048270583152771, + "step": 2175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 403.6785888671875, + "completions/mean_terminated_length": 403.6785888671875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 2.245550683518184, + "grad_norm": 0.86411452293396, + "kl": 0.1263427734375, + "learning_rate": 1e-06, + "loss": -0.0071, + "num_tokens": 255045637.0, + "reward": 1.6334823369979858, + "reward_std": 0.22985121607780457, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6334820985794067, + "rewards/curriculum_aware_reward_fn/std": 0.42966508865356445, + "step": 2176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 443.1339416503906, + "completions/mean_terminated_length": 443.1339416503906, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 2.246582409079185, + "grad_norm": 0.7280702590942383, + "kl": 0.1129150390625, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 255167816.0, + "reward": 1.469642996788025, + "reward_std": 0.18711194396018982, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46964284777641296, + "rewards/curriculum_aware_reward_fn/std": 0.42460137605667114, + "step": 2177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1205.0, + "completions/max_terminated_length": 1205.0, + "completions/mean_length": 422.14288330078125, + "completions/mean_terminated_length": 422.14288330078125, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 2.2476141346401857, + "grad_norm": 0.7460468411445618, + "kl": 0.1173095703125, + "learning_rate": 1e-06, + "loss": 0.008, + "num_tokens": 255278390.0, + "reward": 1.6116071939468384, + "reward_std": 0.16963741183280945, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6116071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.42058423161506653, + "step": 2178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1267.0, + "completions/max_terminated_length": 1267.0, + "completions/mean_length": 414.2410888671875, + "completions/mean_terminated_length": 414.2410888671875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 2.2486458602011865, + "grad_norm": 0.7804027795791626, + "kl": 0.1292724609375, + "learning_rate": 1e-06, + "loss": -0.0025, + "num_tokens": 255389374.0, + "reward": 1.5683037042617798, + "reward_std": 0.23866964876651764, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5772321820259094, + "rewards/curriculum_aware_reward_fn/std": 0.42584776878356934, + "step": 2179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 881.0, + "completions/max_terminated_length": 881.0, + "completions/mean_length": 413.4107360839844, + "completions/mean_terminated_length": 413.4107360839844, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 2.2496775857621873, + "grad_norm": 0.8622057437896729, + "kl": 0.124267578125, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 255506852.0, + "reward": 1.557142972946167, + "reward_std": 0.22154921293258667, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5571428537368774, + "rewards/curriculum_aware_reward_fn/std": 0.4266915023326874, + "step": 2180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 952.0, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 435.1964416503906, + "completions/mean_terminated_length": 435.1964416503906, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 2.250709311323188, + "grad_norm": 0.7885429859161377, + "kl": 0.1177978515625, + "learning_rate": 1e-06, + "loss": 0.0133, + "num_tokens": 255615038.0, + "reward": 1.4687501192092896, + "reward_std": 0.24748605489730835, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.4866071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.40752947330474854, + "step": 2181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 800.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 408.9196472167969, + "completions/mean_terminated_length": 408.9196472167969, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 2.251741036884189, + "grad_norm": 0.655225932598114, + "kl": 0.1143798828125, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 255736395.0, + "reward": 1.4709821939468384, + "reward_std": 0.16663986444473267, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4799107611179352, + "rewards/curriculum_aware_reward_fn/std": 0.43493086099624634, + "step": 2182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1210.0, + "completions/max_terminated_length": 1210.0, + "completions/mean_length": 468.01788330078125, + "completions/mean_terminated_length": 468.01788330078125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 2.2527727624451894, + "grad_norm": 0.6557589769363403, + "kl": 0.12060546875, + "learning_rate": 1e-06, + "loss": 0.0193, + "num_tokens": 255866037.0, + "reward": 1.3339285850524902, + "reward_std": 0.17304901778697968, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.34285715222358704, + "rewards/curriculum_aware_reward_fn/std": 0.4116467535495758, + "step": 2183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 891.0, + "completions/max_terminated_length": 891.0, + "completions/mean_length": 448.8214416503906, + "completions/mean_terminated_length": 448.8214416503906, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 2.2538044880061903, + "grad_norm": 0.7125776410102844, + "kl": 0.1163330078125, + "learning_rate": 1e-06, + "loss": 0.0138, + "num_tokens": 255976505.0, + "reward": 1.4607144594192505, + "reward_std": 0.23142556846141815, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.46964284777641296, + "rewards/curriculum_aware_reward_fn/std": 0.43704357743263245, + "step": 2184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1157.0, + "completions/max_terminated_length": 1157.0, + "completions/mean_length": 487.7232360839844, + "completions/mean_terminated_length": 487.7232360839844, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 2.254836213567191, + "grad_norm": 0.7252135872840881, + "kl": 0.1541748046875, + "learning_rate": 1e-06, + "loss": -0.0051, + "num_tokens": 256106863.0, + "reward": 1.5406252145767212, + "reward_std": 0.25469622015953064, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5406250357627869, + "rewards/curriculum_aware_reward_fn/std": 0.4111361503601074, + "step": 2185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 981.0, + "completions/max_terminated_length": 981.0, + "completions/mean_length": 450.482177734375, + "completions/mean_terminated_length": 450.482177734375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 2.255867939128192, + "grad_norm": 0.6247652173042297, + "kl": 0.1253662109375, + "learning_rate": 1e-06, + "loss": -0.0035, + "num_tokens": 256232098.0, + "reward": 1.5013394355773926, + "reward_std": 0.15615127980709076, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.501339316368103, + "rewards/curriculum_aware_reward_fn/std": 0.4447116255760193, + "step": 2186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1319.0, + "completions/max_terminated_length": 1319.0, + "completions/mean_length": 478.8482360839844, + "completions/mean_terminated_length": 478.8482360839844, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 2.256899664689193, + "grad_norm": 0.7280486226081848, + "kl": 0.1190185546875, + "learning_rate": 1e-06, + "loss": 0.0309, + "num_tokens": 256359453.0, + "reward": 1.4285714626312256, + "reward_std": 0.183299720287323, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4285714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.37088367342948914, + "step": 2187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 434.6160888671875, + "completions/mean_terminated_length": 434.6160888671875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 2.2579313902501936, + "grad_norm": 0.6480599045753479, + "kl": 0.1168212890625, + "learning_rate": 1e-06, + "loss": 0.0122, + "num_tokens": 256469289.0, + "reward": 1.4513393640518188, + "reward_std": 0.16868485510349274, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4513393044471741, + "rewards/curriculum_aware_reward_fn/std": 0.45074811577796936, + "step": 2188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1130.0, + "completions/max_terminated_length": 1130.0, + "completions/mean_length": 455.71429443359375, + "completions/mean_terminated_length": 455.71429443359375, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 2.258963115811194, + "grad_norm": 0.8033361434936523, + "kl": 0.12060546875, + "learning_rate": 1e-06, + "loss": 0.0046, + "num_tokens": 256583854.0, + "reward": 1.6004464626312256, + "reward_std": 0.21607355773448944, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6004464030265808, + "rewards/curriculum_aware_reward_fn/std": 0.38152700662612915, + "step": 2189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1203.0, + "completions/max_terminated_length": 1203.0, + "completions/mean_length": 471.0535888671875, + "completions/mean_terminated_length": 471.0535888671875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 2.259994841372195, + "grad_norm": 0.7653167843818665, + "kl": 0.122314453125, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 256713283.0, + "reward": 1.4058037996292114, + "reward_std": 0.23669905960559845, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40580353140830994, + "rewards/curriculum_aware_reward_fn/std": 0.410709410905838, + "step": 2190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 424.14288330078125, + "completions/mean_terminated_length": 424.14288330078125, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 2.2610265669331957, + "grad_norm": 0.8268042206764221, + "kl": 0.131103515625, + "learning_rate": 1e-06, + "loss": 0.0579, + "num_tokens": 256818628.0, + "reward": 1.549553632736206, + "reward_std": 0.23739364743232727, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5495535731315613, + "rewards/curriculum_aware_reward_fn/std": 0.430665522813797, + "step": 2191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1373.0, + "completions/max_terminated_length": 1373.0, + "completions/mean_length": 431.9821472167969, + "completions/mean_terminated_length": 431.9821472167969, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 2.2620582924941965, + "grad_norm": 0.6653684973716736, + "kl": 0.1309814453125, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 256931529.0, + "reward": 1.5334821939468384, + "reward_std": 0.173820361495018, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5424107313156128, + "rewards/curriculum_aware_reward_fn/std": 0.4615001678466797, + "step": 2192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1228.0, + "completions/max_terminated_length": 1228.0, + "completions/mean_length": 418.3125305175781, + "completions/mean_terminated_length": 418.3125305175781, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 2.2630900180551974, + "grad_norm": 0.7461981773376465, + "kl": 0.132080078125, + "learning_rate": 1e-06, + "loss": 0.0204, + "num_tokens": 257047302.0, + "reward": 1.6566966772079468, + "reward_std": 0.20143945515155792, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6566964387893677, + "rewards/curriculum_aware_reward_fn/std": 0.5511043667793274, + "step": 2193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 871.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 416.5535888671875, + "completions/mean_terminated_length": 416.5535888671875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 2.2641217436161982, + "grad_norm": 0.7070055603981018, + "kl": 0.12548828125, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 257169885.0, + "reward": 1.5910714864730835, + "reward_std": 0.12923413515090942, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5999999642372131, + "rewards/curriculum_aware_reward_fn/std": 0.44010645151138306, + "step": 2194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1140.0, + "completions/max_terminated_length": 1140.0, + "completions/mean_length": 498.5535888671875, + "completions/mean_terminated_length": 498.5535888671875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 2.265153469177199, + "grad_norm": 0.7625406384468079, + "kl": 0.1231689453125, + "learning_rate": 1e-06, + "loss": -0.024, + "num_tokens": 257297620.0, + "reward": 1.3973215818405151, + "reward_std": 0.25870275497436523, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3973214328289032, + "rewards/curriculum_aware_reward_fn/std": 0.39070501923561096, + "step": 2195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 994.0, + "completions/max_terminated_length": 994.0, + "completions/mean_length": 468.6964416503906, + "completions/mean_terminated_length": 468.6964416503906, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 2.2661851947381995, + "grad_norm": 0.8173531293869019, + "kl": 0.1103515625, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 257423803.0, + "reward": 1.415178656578064, + "reward_std": 0.2854968011379242, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4241071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.40929415822029114, + "step": 2196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 806.0, + "completions/max_terminated_length": 806.0, + "completions/mean_length": 424.5000305175781, + "completions/mean_terminated_length": 424.5000305175781, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 2.2672169202992003, + "grad_norm": 0.8341377973556519, + "kl": 0.124755859375, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 257530882.0, + "reward": 1.5250000953674316, + "reward_std": 0.14234140515327454, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5250000357627869, + "rewards/curriculum_aware_reward_fn/std": 0.38414525985717773, + "step": 2197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1062.0, + "completions/max_terminated_length": 1062.0, + "completions/mean_length": 479.26788330078125, + "completions/mean_terminated_length": 479.26788330078125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 2.268248645860201, + "grad_norm": 0.7552391290664673, + "kl": 0.125244140625, + "learning_rate": 1e-06, + "loss": 0.0356, + "num_tokens": 257657598.0, + "reward": 1.3946430683135986, + "reward_std": 0.18035611510276794, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4035714268684387, + "rewards/curriculum_aware_reward_fn/std": 0.3959658443927765, + "step": 2198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 996.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 402.26788330078125, + "completions/mean_terminated_length": 402.26788330078125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 2.269280371421202, + "grad_norm": 0.8114144802093506, + "kl": 0.1298828125, + "learning_rate": 1e-06, + "loss": 0.0141, + "num_tokens": 257757189.0, + "reward": 1.4589285850524902, + "reward_std": 0.22217056155204773, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45892858505249023, + "rewards/curriculum_aware_reward_fn/std": 0.40560346841812134, + "step": 2199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1441.0, + "completions/max_terminated_length": 1441.0, + "completions/mean_length": 511.482177734375, + "completions/mean_terminated_length": 511.482177734375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 2.270312096982203, + "grad_norm": 0.6917448043823242, + "kl": 0.1060791015625, + "learning_rate": 1e-06, + "loss": 0.0047, + "num_tokens": 257880723.0, + "reward": 1.3986608982086182, + "reward_std": 0.2185220867395401, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39866071939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3975975215435028, + "step": 2200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 369.2500305175781, + "completions/mean_terminated_length": 369.2500305175781, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 2.2713438225432037, + "grad_norm": 0.6788763999938965, + "kl": 0.1298828125, + "learning_rate": 1e-06, + "loss": -0.0093, + "num_tokens": 257983952.0, + "reward": 1.6758930683135986, + "reward_std": 0.12376383692026138, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6758928894996643, + "rewards/curriculum_aware_reward_fn/std": 0.3803141415119171, + "step": 2201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1203.0, + "completions/max_terminated_length": 1203.0, + "completions/mean_length": 467.2232360839844, + "completions/mean_terminated_length": 467.2232360839844, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 2.272375548104204, + "grad_norm": 0.7781023383140564, + "kl": 0.113037109375, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 258108059.0, + "reward": 1.4794644117355347, + "reward_std": 0.19456489384174347, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4794642925262451, + "rewards/curriculum_aware_reward_fn/std": 0.3801110088825226, + "step": 2202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 414.8839416503906, + "completions/mean_terminated_length": 414.8839416503906, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 2.273407273665205, + "grad_norm": 0.7982087135314941, + "kl": 0.1376953125, + "learning_rate": 1e-06, + "loss": 0.0288, + "num_tokens": 258222489.0, + "reward": 1.4321428537368774, + "reward_std": 0.2655171751976013, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43214288353919983, + "rewards/curriculum_aware_reward_fn/std": 0.4219675660133362, + "step": 2203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1294.0, + "completions/max_terminated_length": 1294.0, + "completions/mean_length": 486.33038330078125, + "completions/mean_terminated_length": 486.33038330078125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 2.2744389992262057, + "grad_norm": 0.8439200520515442, + "kl": 0.108642578125, + "learning_rate": 1e-06, + "loss": 0.0346, + "num_tokens": 258344165.0, + "reward": 1.5200893878936768, + "reward_std": 0.23756206035614014, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5200892686843872, + "rewards/curriculum_aware_reward_fn/std": 0.3993496894836426, + "step": 2204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1247.0, + "completions/max_terminated_length": 1247.0, + "completions/mean_length": 424.8660888671875, + "completions/mean_terminated_length": 424.8660888671875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 2.2754707247872066, + "grad_norm": 0.6224795579910278, + "kl": 0.12744140625, + "learning_rate": 1e-06, + "loss": 0.0252, + "num_tokens": 258453021.0, + "reward": 1.5691964626312256, + "reward_std": 0.10045679658651352, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5691964030265808, + "rewards/curriculum_aware_reward_fn/std": 0.4354890286922455, + "step": 2205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 902.0, + "completions/max_terminated_length": 902.0, + "completions/mean_length": 474.7232360839844, + "completions/mean_terminated_length": 474.7232360839844, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 2.2765024503482074, + "grad_norm": 0.8304800987243652, + "kl": 0.12255859375, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 258578773.0, + "reward": 1.497321605682373, + "reward_std": 0.19776052236557007, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4973214268684387, + "rewards/curriculum_aware_reward_fn/std": 0.37565794587135315, + "step": 2206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 426.9910888671875, + "completions/mean_terminated_length": 426.9910888671875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 2.2775341759092083, + "grad_norm": 0.7672892808914185, + "kl": 0.134765625, + "learning_rate": 1e-06, + "loss": 0.0061, + "num_tokens": 258701433.0, + "reward": 1.6191965341567993, + "reward_std": 0.23367580771446228, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6191964745521545, + "rewards/curriculum_aware_reward_fn/std": 0.439196914434433, + "step": 2207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 845.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 445.1160888671875, + "completions/mean_terminated_length": 445.1160888671875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 2.278565901470209, + "grad_norm": 0.7751359343528748, + "kl": 0.13525390625, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 258827628.0, + "reward": 1.5513393878936768, + "reward_std": 0.29002559185028076, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5513392686843872, + "rewards/curriculum_aware_reward_fn/std": 0.4421721398830414, + "step": 2208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1278.0, + "completions/max_terminated_length": 1278.0, + "completions/mean_length": 464.9375305175781, + "completions/mean_terminated_length": 464.9375305175781, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 2.27959762703121, + "grad_norm": 0.6202963590621948, + "kl": 0.1119384765625, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 258950604.0, + "reward": 1.4495537281036377, + "reward_std": 0.18529221415519714, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44955354928970337, + "rewards/curriculum_aware_reward_fn/std": 0.4137009382247925, + "step": 2209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1027.0, + "completions/max_terminated_length": 1027.0, + "completions/mean_length": 474.6339416503906, + "completions/mean_terminated_length": 474.6339416503906, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 2.2806293525922103, + "grad_norm": 0.8014189004898071, + "kl": 0.11669921875, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 259076143.0, + "reward": 1.4544644355773926, + "reward_std": 0.1441306471824646, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45446428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.37719646096229553, + "step": 2210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1049.0, + "completions/max_terminated_length": 1049.0, + "completions/mean_length": 449.26788330078125, + "completions/mean_terminated_length": 449.26788330078125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 2.281661078153211, + "grad_norm": 0.794110119342804, + "kl": 0.129150390625, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 259191659.0, + "reward": 1.5883928537368774, + "reward_std": 0.16244350373744965, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5883927941322327, + "rewards/curriculum_aware_reward_fn/std": 0.5123687386512756, + "step": 2211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 448.3660888671875, + "completions/mean_terminated_length": 448.3660888671875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 2.282692803714212, + "grad_norm": 0.7245016694068909, + "kl": 0.130615234375, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 259313745.0, + "reward": 1.4339287281036377, + "reward_std": 0.18925009667873383, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43392854928970337, + "rewards/curriculum_aware_reward_fn/std": 0.44210872054100037, + "step": 2212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 447.21429443359375, + "completions/mean_terminated_length": 447.21429443359375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 2.283724529275213, + "grad_norm": 0.844030499458313, + "kl": 0.136962890625, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 259432657.0, + "reward": 1.4325894117355347, + "reward_std": 0.20370341837406158, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4325892925262451, + "rewards/curriculum_aware_reward_fn/std": 0.42694756388664246, + "step": 2213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 937.0, + "completions/max_terminated_length": 937.0, + "completions/mean_length": 427.90179443359375, + "completions/mean_terminated_length": 427.90179443359375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 2.2847562548362137, + "grad_norm": 0.7148916125297546, + "kl": 0.127197265625, + "learning_rate": 1e-06, + "loss": -0.0047, + "num_tokens": 259546844.0, + "reward": 1.5227677822113037, + "reward_std": 0.25406861305236816, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5316964387893677, + "rewards/curriculum_aware_reward_fn/std": 0.4465582072734833, + "step": 2214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1162.0, + "completions/max_terminated_length": 1162.0, + "completions/mean_length": 485.4285888671875, + "completions/mean_terminated_length": 485.4285888671875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 2.285787980397214, + "grad_norm": 0.6173986196517944, + "kl": 0.1173095703125, + "learning_rate": 1e-06, + "loss": 0.0327, + "num_tokens": 259672822.0, + "reward": 1.4812500476837158, + "reward_std": 0.1721828132867813, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48125001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.4544140100479126, + "step": 2215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1214.0, + "completions/max_terminated_length": 1214.0, + "completions/mean_length": 458.0625305175781, + "completions/mean_terminated_length": 458.0625305175781, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 2.286819705958215, + "grad_norm": 0.7467107176780701, + "kl": 0.12646484375, + "learning_rate": 1e-06, + "loss": 0.0235, + "num_tokens": 259791297.0, + "reward": 1.3580358028411865, + "reward_std": 0.12079530954360962, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35803571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.3610890507698059, + "step": 2216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1018.0, + "completions/max_terminated_length": 1018.0, + "completions/mean_length": 455.5535888671875, + "completions/mean_terminated_length": 455.5535888671875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 2.2878514315192158, + "grad_norm": 0.7681455016136169, + "kl": 0.1279296875, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 259909736.0, + "reward": 1.3901787996292114, + "reward_std": 0.19109110534191132, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39017853140830994, + "rewards/curriculum_aware_reward_fn/std": 0.3828437328338623, + "step": 2217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 990.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 416.0357360839844, + "completions/mean_terminated_length": 416.0357360839844, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 2.2888831570802166, + "grad_norm": 0.8476653099060059, + "kl": 0.13671875, + "learning_rate": 1e-06, + "loss": 0.0122, + "num_tokens": 260027781.0, + "reward": 1.6607143878936768, + "reward_std": 0.09141557663679123, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6607142686843872, + "rewards/curriculum_aware_reward_fn/std": 0.3630097210407257, + "step": 2218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 432.51788330078125, + "completions/mean_terminated_length": 432.51788330078125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 2.2899148826412175, + "grad_norm": 0.8101479411125183, + "kl": 0.1180419921875, + "learning_rate": 1e-06, + "loss": -0.0078, + "num_tokens": 260153475.0, + "reward": 1.5424107313156128, + "reward_std": 0.20149752497673035, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5424107313156128, + "rewards/curriculum_aware_reward_fn/std": 0.41078969836235046, + "step": 2219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 405.65179443359375, + "completions/mean_terminated_length": 405.65179443359375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 2.2909466082022183, + "grad_norm": 0.8121070861816406, + "kl": 0.1209716796875, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 260271540.0, + "reward": 1.6486608982086182, + "reward_std": 0.18063603341579437, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6486607193946838, + "rewards/curriculum_aware_reward_fn/std": 0.42066335678100586, + "step": 2220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1101.0, + "completions/max_terminated_length": 1101.0, + "completions/mean_length": 442.3482360839844, + "completions/mean_terminated_length": 442.3482360839844, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 2.291978333763219, + "grad_norm": 0.677873969078064, + "kl": 0.1268310546875, + "learning_rate": 1e-06, + "loss": -0.0207, + "num_tokens": 260389123.0, + "reward": 1.462053656578064, + "reward_std": 0.2162722647190094, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4620535671710968, + "rewards/curriculum_aware_reward_fn/std": 0.4583187997341156, + "step": 2221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1160.0, + "completions/max_terminated_length": 1160.0, + "completions/mean_length": 481.1964416503906, + "completions/mean_terminated_length": 481.1964416503906, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 2.29301005932422, + "grad_norm": 0.6380321979522705, + "kl": 0.116455078125, + "learning_rate": 1e-06, + "loss": 0.0193, + "num_tokens": 260510959.0, + "reward": 1.6214287281036377, + "reward_std": 0.1788286566734314, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6214286088943481, + "rewards/curriculum_aware_reward_fn/std": 0.4121856689453125, + "step": 2222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 390.4732360839844, + "completions/mean_terminated_length": 390.4732360839844, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 2.2940417848852204, + "grad_norm": 0.8966089487075806, + "kl": 0.13671875, + "learning_rate": 1e-06, + "loss": 0.0197, + "num_tokens": 260621458.0, + "reward": 1.5357143878936768, + "reward_std": 0.21034854650497437, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5357142686843872, + "rewards/curriculum_aware_reward_fn/std": 0.4012608826160431, + "step": 2223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 472.7410888671875, + "completions/mean_terminated_length": 472.7410888671875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 2.295073510446221, + "grad_norm": 1.1112465858459473, + "kl": 0.19580078125, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 260750502.0, + "reward": 1.4330357313156128, + "reward_std": 0.1422305405139923, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4330357015132904, + "rewards/curriculum_aware_reward_fn/std": 0.40606623888015747, + "step": 2224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1136.0, + "completions/max_terminated_length": 1136.0, + "completions/mean_length": 452.6785888671875, + "completions/mean_terminated_length": 452.6785888671875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 2.296105236007222, + "grad_norm": 0.8665755987167358, + "kl": 0.1282958984375, + "learning_rate": 1e-06, + "loss": 0.0106, + "num_tokens": 260862172.0, + "reward": 1.536607265472412, + "reward_std": 0.1756691336631775, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5366071462631226, + "rewards/curriculum_aware_reward_fn/std": 0.36772873997688293, + "step": 2225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1264.0, + "completions/max_terminated_length": 1264.0, + "completions/mean_length": 445.3750305175781, + "completions/mean_terminated_length": 445.3750305175781, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 2.297136961568223, + "grad_norm": 0.8168461322784424, + "kl": 0.119140625, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 260981970.0, + "reward": 1.604017972946167, + "reward_std": 0.18865753710269928, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6040178537368774, + "rewards/curriculum_aware_reward_fn/std": 0.42844459414482117, + "step": 2226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1155.0, + "completions/max_terminated_length": 1155.0, + "completions/mean_length": 475.0625305175781, + "completions/mean_terminated_length": 475.0625305175781, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 2.2981686871292237, + "grad_norm": 0.7673314213752747, + "kl": 0.1199951171875, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 261108054.0, + "reward": 1.4441964626312256, + "reward_std": 0.21315507590770721, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4441964328289032, + "rewards/curriculum_aware_reward_fn/std": 0.4495115280151367, + "step": 2227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1077.0, + "completions/max_terminated_length": 1077.0, + "completions/mean_length": 455.3750305175781, + "completions/mean_terminated_length": 455.3750305175781, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 2.2992004126902246, + "grad_norm": 0.6256400942802429, + "kl": 0.140380859375, + "learning_rate": 1e-06, + "loss": 0.0269, + "num_tokens": 261222063.0, + "reward": 1.375892996788025, + "reward_std": 0.18649274110794067, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37589284777641296, + "rewards/curriculum_aware_reward_fn/std": 0.45102396607398987, + "step": 2228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 894.0, + "completions/max_terminated_length": 894.0, + "completions/mean_length": 446.2232360839844, + "completions/mean_terminated_length": 446.2232360839844, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 2.300232138251225, + "grad_norm": 0.7473752498626709, + "kl": 0.1207275390625, + "learning_rate": 1e-06, + "loss": -0.0046, + "num_tokens": 261335362.0, + "reward": 1.4924107789993286, + "reward_std": 0.18438197672367096, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.49241071939468384, + "rewards/curriculum_aware_reward_fn/std": 0.4038781523704529, + "step": 2229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1218.0, + "completions/max_terminated_length": 1218.0, + "completions/mean_length": 487.7410888671875, + "completions/mean_terminated_length": 487.7410888671875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 2.301263863812226, + "grad_norm": 0.8416672348976135, + "kl": 0.1160888671875, + "learning_rate": 1e-06, + "loss": 0.018, + "num_tokens": 261459968.0, + "reward": 1.4116073846817017, + "reward_std": 0.2052520364522934, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41160711646080017, + "rewards/curriculum_aware_reward_fn/std": 0.43090006709098816, + "step": 2230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1127.0, + "completions/mean_length": 439.232177734375, + "completions/mean_terminated_length": 406.2882995605469, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 2.3022955893732266, + "grad_norm": 0.7082986831665039, + "kl": 0.1705322265625, + "learning_rate": 1e-06, + "loss": 0.0575, + "num_tokens": 261574220.0, + "reward": 1.5107144117355347, + "reward_std": 0.24220815300941467, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5285714268684387, + "rewards/curriculum_aware_reward_fn/std": 0.40568673610687256, + "step": 2231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 453.0982360839844, + "completions/mean_terminated_length": 453.0982360839844, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 2.3033273149342275, + "grad_norm": 0.7227510809898376, + "kl": 0.128662109375, + "learning_rate": 1e-06, + "loss": -0.0082, + "num_tokens": 261695828.0, + "reward": 1.6334823369979858, + "reward_std": 0.18725861608982086, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6334820985794067, + "rewards/curriculum_aware_reward_fn/std": 0.3946368098258972, + "step": 2232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2562.0, + "completions/max_terminated_length": 2562.0, + "completions/mean_length": 444.6964416503906, + "completions/mean_terminated_length": 444.6964416503906, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 2.3043590404952283, + "grad_norm": 0.8308013081550598, + "kl": 0.137451171875, + "learning_rate": 1e-06, + "loss": 0.0517, + "num_tokens": 261813408.0, + "reward": 1.4589287042617798, + "reward_std": 0.26964929699897766, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.47678571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.44402584433555603, + "step": 2233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 425.9375305175781, + "completions/mean_terminated_length": 425.9375305175781, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 2.305390766056229, + "grad_norm": 0.772957444190979, + "kl": 0.131103515625, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 261927137.0, + "reward": 1.4250000715255737, + "reward_std": 0.1946423500776291, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.43392854928970337, + "rewards/curriculum_aware_reward_fn/std": 0.3968464732170105, + "step": 2234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 977.0, + "completions/mean_length": 473.7589416503906, + "completions/mean_terminated_length": 441.1261291503906, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 2.30642249161723, + "grad_norm": 0.7113568782806396, + "kl": 0.11328125, + "learning_rate": 1e-06, + "loss": 0.0474, + "num_tokens": 262055840.0, + "reward": 1.4651787281036377, + "reward_std": 0.2366374433040619, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.47410717606544495, + "rewards/curriculum_aware_reward_fn/std": 0.43959349393844604, + "step": 2235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1298.0, + "completions/max_terminated_length": 1298.0, + "completions/mean_length": 500.0535888671875, + "completions/mean_terminated_length": 500.0535888671875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 2.3074542171782304, + "grad_norm": 0.7893040776252747, + "kl": 0.10888671875, + "learning_rate": 1e-06, + "loss": 0.0275, + "num_tokens": 262179844.0, + "reward": 1.3928571939468384, + "reward_std": 0.22476813197135925, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3928571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.4019578993320465, + "step": 2236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1026.0, + "completions/max_terminated_length": 1026.0, + "completions/mean_length": 500.2589416503906, + "completions/mean_terminated_length": 500.2589416503906, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 2.3084859427392312, + "grad_norm": 0.7808545827865601, + "kl": 0.12548828125, + "learning_rate": 1e-06, + "loss": -0.0198, + "num_tokens": 262315743.0, + "reward": 1.3504464626312256, + "reward_std": 0.2415580451488495, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.359375, + "rewards/curriculum_aware_reward_fn/std": 0.39218324422836304, + "step": 2237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1035.0, + "completions/max_terminated_length": 1035.0, + "completions/mean_length": 445.33929443359375, + "completions/mean_terminated_length": 445.33929443359375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 2.309517668300232, + "grad_norm": 0.8054758906364441, + "kl": 0.1317138671875, + "learning_rate": 1e-06, + "loss": -0.0297, + "num_tokens": 262437539.0, + "reward": 1.5839285850524902, + "reward_std": 0.25896844267845154, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5839285850524902, + "rewards/curriculum_aware_reward_fn/std": 0.4052145481109619, + "step": 2238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1190.0, + "completions/max_terminated_length": 1190.0, + "completions/mean_length": 464.96429443359375, + "completions/mean_terminated_length": 464.96429443359375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 2.310549393861233, + "grad_norm": 0.5957467555999756, + "kl": 0.1160888671875, + "learning_rate": 1e-06, + "loss": 0.0114, + "num_tokens": 262550046.0, + "reward": 1.5642858743667603, + "reward_std": 0.14827460050582886, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5642856955528259, + "rewards/curriculum_aware_reward_fn/std": 0.3665291666984558, + "step": 2239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1060.0, + "completions/max_terminated_length": 1060.0, + "completions/mean_length": 480.6875305175781, + "completions/mean_terminated_length": 480.6875305175781, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 2.3115811194222338, + "grad_norm": 0.7734221816062927, + "kl": 0.122802734375, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 262675164.0, + "reward": 1.3772321939468384, + "reward_std": 0.23709361255168915, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3772321343421936, + "rewards/curriculum_aware_reward_fn/std": 0.44153133034706116, + "step": 2240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1401.0, + "completions/max_terminated_length": 1401.0, + "completions/mean_length": 484.4464416503906, + "completions/mean_terminated_length": 484.4464416503906, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 2.3126128449832346, + "grad_norm": 0.611352264881134, + "kl": 0.1220703125, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 262796594.0, + "reward": 1.5602680444717407, + "reward_std": 0.2347055971622467, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5691964030265808, + "rewards/curriculum_aware_reward_fn/std": 0.4562007784843445, + "step": 2241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1010.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 409.9732360839844, + "completions/mean_terminated_length": 409.9732360839844, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 2.313644570544235, + "grad_norm": 0.7636814117431641, + "kl": 0.134521484375, + "learning_rate": 1e-06, + "loss": 0.0353, + "num_tokens": 262907338.0, + "reward": 1.5718750953674316, + "reward_std": 0.25764763355255127, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5718750357627869, + "rewards/curriculum_aware_reward_fn/std": 0.46230822801589966, + "step": 2242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1714.0, + "completions/max_terminated_length": 1714.0, + "completions/mean_length": 509.14288330078125, + "completions/mean_terminated_length": 509.14288330078125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 2.314676296105236, + "grad_norm": 0.8017022013664246, + "kl": 0.1229248046875, + "learning_rate": 1e-06, + "loss": -0.0261, + "num_tokens": 263037484.0, + "reward": 1.4218751192092896, + "reward_std": 0.193229541182518, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.421875, + "rewards/curriculum_aware_reward_fn/std": 0.37119126319885254, + "step": 2243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1883.0, + "completions/max_terminated_length": 1883.0, + "completions/mean_length": 455.5535888671875, + "completions/mean_terminated_length": 455.5535888671875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 2.3157080216662367, + "grad_norm": 0.780221164226532, + "kl": 0.1318359375, + "learning_rate": 1e-06, + "loss": 0.0138, + "num_tokens": 263157379.0, + "reward": 1.555803656578064, + "reward_std": 0.23983052372932434, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5558035373687744, + "rewards/curriculum_aware_reward_fn/std": 0.43088769912719727, + "step": 2244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1017.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 393.4196472167969, + "completions/mean_terminated_length": 393.4196472167969, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 2.3167397472272375, + "grad_norm": 0.7895362973213196, + "kl": 0.141845703125, + "learning_rate": 1e-06, + "loss": -0.0141, + "num_tokens": 263264925.0, + "reward": 1.599107265472412, + "reward_std": 0.1769608110189438, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.6169642806053162, + "rewards/curriculum_aware_reward_fn/std": 0.4076163172721863, + "step": 2245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 434.7410888671875, + "completions/mean_terminated_length": 434.7410888671875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 2.3177714727882384, + "grad_norm": 0.677725613117218, + "kl": 0.13525390625, + "learning_rate": 1e-06, + "loss": -0.0109, + "num_tokens": 263372675.0, + "reward": 1.3598215579986572, + "reward_std": 0.14816580712795258, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3598214089870453, + "rewards/curriculum_aware_reward_fn/std": 0.3798908591270447, + "step": 2246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 457.8839416503906, + "completions/mean_terminated_length": 457.8839416503906, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 2.318803198349239, + "grad_norm": 0.7738561034202576, + "kl": 0.1317138671875, + "learning_rate": 1e-06, + "loss": -0.0075, + "num_tokens": 263488267.0, + "reward": 1.5080358982086182, + "reward_std": 0.2324211597442627, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5080357193946838, + "rewards/curriculum_aware_reward_fn/std": 0.42487698793411255, + "step": 2247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 414.21429443359375, + "completions/mean_terminated_length": 414.21429443359375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 2.31983492391024, + "grad_norm": 0.7853266596794128, + "kl": 0.1280517578125, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 263593515.0, + "reward": 1.641517996788025, + "reward_std": 0.1926991194486618, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6415179371833801, + "rewards/curriculum_aware_reward_fn/std": 0.42521265149116516, + "step": 2248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3256.0, + "completions/max_terminated_length": 3256.0, + "completions/mean_length": 452.5535888671875, + "completions/mean_terminated_length": 452.5535888671875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 2.320866649471241, + "grad_norm": 0.6526082158088684, + "kl": 0.13037109375, + "learning_rate": 1e-06, + "loss": 0.019, + "num_tokens": 263714845.0, + "reward": 1.4147322177886963, + "reward_std": 0.13459168374538422, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4147321581840515, + "rewards/curriculum_aware_reward_fn/std": 0.4585837125778198, + "step": 2249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2341.0, + "completions/max_terminated_length": 2341.0, + "completions/mean_length": 583.8214721679688, + "completions/mean_terminated_length": 583.8214721679688, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 2.3218983750322413, + "grad_norm": 0.6811022162437439, + "kl": 0.1121826171875, + "learning_rate": 1e-06, + "loss": -0.0076, + "num_tokens": 263856484.0, + "reward": 1.2991071939468384, + "reward_std": 0.17378173768520355, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3080357015132904, + "rewards/curriculum_aware_reward_fn/std": 0.3994677662849426, + "step": 2250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1736.0, + "completions/max_terminated_length": 1736.0, + "completions/mean_length": 508.9910888671875, + "completions/mean_terminated_length": 508.9910888671875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 2.322930100593242, + "grad_norm": 0.8488061428070068, + "kl": 0.1119384765625, + "learning_rate": 1e-06, + "loss": 0.0068, + "num_tokens": 263986858.0, + "reward": 1.4433037042617798, + "reward_std": 0.19707278907299042, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44330358505249023, + "rewards/curriculum_aware_reward_fn/std": 0.4065065085887909, + "step": 2251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1205.0, + "completions/max_terminated_length": 1205.0, + "completions/mean_length": 469.1250305175781, + "completions/mean_terminated_length": 469.1250305175781, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 2.323961826154243, + "grad_norm": 0.7594837546348572, + "kl": 0.1251220703125, + "learning_rate": 1e-06, + "loss": 0.0166, + "num_tokens": 264111955.0, + "reward": 1.4861608743667603, + "reward_std": 0.2393674999475479, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.49508926272392273, + "rewards/curriculum_aware_reward_fn/std": 0.4587958753108978, + "step": 2252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1379.0, + "completions/max_terminated_length": 1379.0, + "completions/mean_length": 482.4910888671875, + "completions/mean_terminated_length": 482.4910888671875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 2.324993551715244, + "grad_norm": 0.8010709881782532, + "kl": 0.1263427734375, + "learning_rate": 1e-06, + "loss": 0.0297, + "num_tokens": 264225312.0, + "reward": 1.3026787042617798, + "reward_std": 0.1675240397453308, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30267858505249023, + "rewards/curriculum_aware_reward_fn/std": 0.32081252336502075, + "step": 2253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1125.0, + "completions/max_terminated_length": 1125.0, + "completions/mean_length": 484.5714416503906, + "completions/mean_terminated_length": 484.5714416503906, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 2.3260252772762446, + "grad_norm": 0.7815201878547668, + "kl": 0.111572265625, + "learning_rate": 1e-06, + "loss": -0.0204, + "num_tokens": 264345083.0, + "reward": 1.400892972946167, + "reward_std": 0.2504502832889557, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.41875001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.4531235098838806, + "step": 2254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1900.0, + "completions/max_terminated_length": 1900.0, + "completions/mean_length": 509.9107360839844, + "completions/mean_terminated_length": 509.9107360839844, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 2.3270570028372455, + "grad_norm": 0.7713619470596313, + "kl": 0.129150390625, + "learning_rate": 1e-06, + "loss": 0.0363, + "num_tokens": 264469722.0, + "reward": 1.427232265472412, + "reward_std": 0.1826336532831192, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42723211646080017, + "rewards/curriculum_aware_reward_fn/std": 0.3809003233909607, + "step": 2255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 414.51788330078125, + "completions/mean_terminated_length": 414.51788330078125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 2.328088728398246, + "grad_norm": 0.6340947151184082, + "kl": 0.127685546875, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 264579407.0, + "reward": 1.6540179252624512, + "reward_std": 0.11283912509679794, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6540178656578064, + "rewards/curriculum_aware_reward_fn/std": 0.42849716544151306, + "step": 2256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1256.0, + "completions/max_terminated_length": 1256.0, + "completions/mean_length": 479.4464416503906, + "completions/mean_terminated_length": 479.4464416503906, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 2.3291204539592467, + "grad_norm": 0.7967779040336609, + "kl": 0.1357421875, + "learning_rate": 1e-06, + "loss": -0.0253, + "num_tokens": 264713101.0, + "reward": 1.4651787281036377, + "reward_std": 0.19649997353553772, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.47410711646080017, + "rewards/curriculum_aware_reward_fn/std": 0.3908861577510834, + "step": 2257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1696.0, + "completions/max_terminated_length": 1696.0, + "completions/mean_length": 446.857177734375, + "completions/mean_terminated_length": 446.857177734375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 2.3301521795202476, + "grad_norm": 0.7497438788414001, + "kl": 0.1353759765625, + "learning_rate": 1e-06, + "loss": 0.0182, + "num_tokens": 264834654.0, + "reward": 1.6334823369979858, + "reward_std": 0.25653204321861267, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6334820985794067, + "rewards/curriculum_aware_reward_fn/std": 0.3811663091182709, + "step": 2258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1125.0, + "completions/max_terminated_length": 1125.0, + "completions/mean_length": 455.8482360839844, + "completions/mean_terminated_length": 455.8482360839844, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 2.3311839050812484, + "grad_norm": 0.8482918739318848, + "kl": 0.1436767578125, + "learning_rate": 1e-06, + "loss": -0.0481, + "num_tokens": 264950536.0, + "reward": 1.4495537281036377, + "reward_std": 0.22892731428146362, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44955354928970337, + "rewards/curriculum_aware_reward_fn/std": 0.3629518449306488, + "step": 2259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1127.0, + "completions/max_terminated_length": 1127.0, + "completions/mean_length": 439.6785888671875, + "completions/mean_terminated_length": 439.6785888671875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 2.3322156306422492, + "grad_norm": 0.7075839042663574, + "kl": 0.130615234375, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 265072720.0, + "reward": 1.532589316368103, + "reward_std": 0.18878993391990662, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.532589316368103, + "rewards/curriculum_aware_reward_fn/std": 0.4554331600666046, + "step": 2260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 441.6785888671875, + "completions/mean_terminated_length": 441.6785888671875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 2.33324735620325, + "grad_norm": 0.7362609505653381, + "kl": 0.138671875, + "learning_rate": 1e-06, + "loss": 0.0207, + "num_tokens": 265199881.0, + "reward": 1.5388394594192505, + "reward_std": 0.23343265056610107, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5388392806053162, + "rewards/curriculum_aware_reward_fn/std": 0.44187918305397034, + "step": 2261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1492.0, + "completions/max_terminated_length": 1492.0, + "completions/mean_length": 487.6160888671875, + "completions/mean_terminated_length": 487.6160888671875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 2.334279081764251, + "grad_norm": 0.6873180270195007, + "kl": 0.1212158203125, + "learning_rate": 1e-06, + "loss": -0.0413, + "num_tokens": 265313763.0, + "reward": 1.5026787519454956, + "reward_std": 0.15679128468036652, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5026785731315613, + "rewards/curriculum_aware_reward_fn/std": 0.4019568860530853, + "step": 2262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1436.0, + "completions/max_terminated_length": 1436.0, + "completions/mean_length": 500.4375305175781, + "completions/mean_terminated_length": 500.4375305175781, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 2.3353108073252513, + "grad_norm": 0.7277102470397949, + "kl": 0.1156005859375, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 265439469.0, + "reward": 1.4388394355773926, + "reward_std": 0.2303120493888855, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43883928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3845384418964386, + "step": 2263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1172.0, + "completions/max_terminated_length": 1172.0, + "completions/mean_length": 453.33929443359375, + "completions/mean_terminated_length": 453.33929443359375, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 2.336342532886252, + "grad_norm": 0.7865290641784668, + "kl": 0.14306640625, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 265553516.0, + "reward": 1.4803574085235596, + "reward_std": 0.18225398659706116, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48035717010498047, + "rewards/curriculum_aware_reward_fn/std": 0.35938870906829834, + "step": 2264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1266.0, + "completions/max_terminated_length": 1266.0, + "completions/mean_length": 477.8660888671875, + "completions/mean_terminated_length": 477.8660888671875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 2.337374258447253, + "grad_norm": 0.6291416883468628, + "kl": 0.12646484375, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 265678626.0, + "reward": 1.3834823369979858, + "reward_std": 0.18859294056892395, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3834821283817291, + "rewards/curriculum_aware_reward_fn/std": 0.424231618642807, + "step": 2265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1324.0, + "completions/max_terminated_length": 1324.0, + "completions/mean_length": 446.9732360839844, + "completions/mean_terminated_length": 446.9732360839844, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 2.338405984008254, + "grad_norm": 0.775569498538971, + "kl": 0.1163330078125, + "learning_rate": 1e-06, + "loss": -0.0067, + "num_tokens": 265794193.0, + "reward": 1.4651787281036377, + "reward_std": 0.15202991664409637, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46517857909202576, + "rewards/curriculum_aware_reward_fn/std": 0.43047425150871277, + "step": 2266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1233.0, + "completions/max_terminated_length": 1233.0, + "completions/mean_length": 443.83038330078125, + "completions/mean_terminated_length": 443.83038330078125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 2.3394377095692547, + "grad_norm": 0.8700284957885742, + "kl": 0.13916015625, + "learning_rate": 1e-06, + "loss": -0.0251, + "num_tokens": 265907847.0, + "reward": 1.4928573369979858, + "reward_std": 0.23910239338874817, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5017857551574707, + "rewards/curriculum_aware_reward_fn/std": 0.4329829812049866, + "step": 2267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1072.0, + "completions/max_terminated_length": 1072.0, + "completions/mean_length": 479.169677734375, + "completions/mean_terminated_length": 479.169677734375, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 2.3404694351302555, + "grad_norm": 0.848242461681366, + "kl": 0.138671875, + "learning_rate": 1e-06, + "loss": 0.0144, + "num_tokens": 266036232.0, + "reward": 1.4566963911056519, + "reward_std": 0.18905363976955414, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45669645071029663, + "rewards/curriculum_aware_reward_fn/std": 0.4086616039276123, + "step": 2268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2355.0, + "completions/max_terminated_length": 2355.0, + "completions/mean_length": 502.2589416503906, + "completions/mean_terminated_length": 502.2589416503906, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 2.341501160691256, + "grad_norm": 0.6625986695289612, + "kl": 0.119873046875, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 266163913.0, + "reward": 1.3316965103149414, + "reward_std": 0.17715652287006378, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.34062501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.38370707631111145, + "step": 2269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 425.3035888671875, + "completions/mean_terminated_length": 425.3035888671875, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 2.3425328862522568, + "grad_norm": 0.8239805698394775, + "kl": 0.136962890625, + "learning_rate": 1e-06, + "loss": -0.0156, + "num_tokens": 266276549.0, + "reward": 1.4437501430511475, + "reward_std": 0.17334595322608948, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4437499940395355, + "rewards/curriculum_aware_reward_fn/std": 0.4235268533229828, + "step": 2270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1084.0, + "completions/max_terminated_length": 1084.0, + "completions/mean_length": 455.14288330078125, + "completions/mean_terminated_length": 455.14288330078125, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 2.3435646118132576, + "grad_norm": 0.7383701801300049, + "kl": 0.1275634765625, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 266399905.0, + "reward": 1.5540181398391724, + "reward_std": 0.2154999077320099, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5540178418159485, + "rewards/curriculum_aware_reward_fn/std": 0.44261398911476135, + "step": 2271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1198.0, + "completions/max_terminated_length": 1198.0, + "completions/mean_length": 447.4285888671875, + "completions/mean_terminated_length": 447.4285888671875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 2.3445963373742584, + "grad_norm": 0.7752024531364441, + "kl": 0.134521484375, + "learning_rate": 1e-06, + "loss": 0.0281, + "num_tokens": 266518057.0, + "reward": 1.5308037996292114, + "reward_std": 0.17774909734725952, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5308035612106323, + "rewards/curriculum_aware_reward_fn/std": 0.41731905937194824, + "step": 2272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1265.0, + "completions/max_terminated_length": 1265.0, + "completions/mean_length": 462.9910888671875, + "completions/mean_terminated_length": 462.9910888671875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 2.3456280629352593, + "grad_norm": 0.7307128310203552, + "kl": 0.1312255859375, + "learning_rate": 1e-06, + "loss": -0.0224, + "num_tokens": 266633808.0, + "reward": 1.4352679252624512, + "reward_std": 0.16883431375026703, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4352678656578064, + "rewards/curriculum_aware_reward_fn/std": 0.42312854528427124, + "step": 2273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2141.0, + "completions/mean_length": 510.8750305175781, + "completions/mean_terminated_length": 478.57659912109375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 2.34665978849626, + "grad_norm": 0.6801409125328064, + "kl": 0.13232421875, + "learning_rate": 1e-06, + "loss": -0.036, + "num_tokens": 266755300.0, + "reward": 1.5232144594192505, + "reward_std": 0.12608736753463745, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5232142806053162, + "rewards/curriculum_aware_reward_fn/std": 0.4231972098350525, + "step": 2274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2522.0, + "completions/max_terminated_length": 2522.0, + "completions/mean_length": 495.5535888671875, + "completions/mean_terminated_length": 495.5535888671875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 2.347691514057261, + "grad_norm": 0.8027746677398682, + "kl": 0.123046875, + "learning_rate": 1e-06, + "loss": 0.0518, + "num_tokens": 266875237.0, + "reward": 1.4709821939468384, + "reward_std": 0.1888292133808136, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4709821343421936, + "rewards/curriculum_aware_reward_fn/std": 0.44095709919929504, + "step": 2275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1802.0, + "completions/max_terminated_length": 1802.0, + "completions/mean_length": 440.08038330078125, + "completions/mean_terminated_length": 440.08038330078125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 2.3487232396182613, + "grad_norm": 0.8603218793869019, + "kl": 0.140380859375, + "learning_rate": 1e-06, + "loss": -0.0097, + "num_tokens": 266987414.0, + "reward": 1.5241072177886963, + "reward_std": 0.1688319742679596, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5330356955528259, + "rewards/curriculum_aware_reward_fn/std": 0.42323046922683716, + "step": 2276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1166.0, + "completions/max_terminated_length": 1166.0, + "completions/mean_length": 425.5357360839844, + "completions/mean_terminated_length": 425.5357360839844, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 2.349754965179262, + "grad_norm": 0.7801834344863892, + "kl": 0.1314697265625, + "learning_rate": 1e-06, + "loss": 0.0518, + "num_tokens": 267094175.0, + "reward": 1.6169644594192505, + "reward_std": 0.25597792863845825, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6258928179740906, + "rewards/curriculum_aware_reward_fn/std": 0.43199628591537476, + "step": 2277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3730.0, + "completions/max_terminated_length": 3730.0, + "completions/mean_length": 505.8125305175781, + "completions/mean_terminated_length": 505.8125305175781, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 2.350786690740263, + "grad_norm": 1.1834087371826172, + "kl": 0.130126953125, + "learning_rate": 1e-06, + "loss": -0.0245, + "num_tokens": 267217123.0, + "reward": 1.4629465341567993, + "reward_std": 0.17395009100437164, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46294641494750977, + "rewards/curriculum_aware_reward_fn/std": 0.43947890400886536, + "step": 2278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1123.0, + "completions/max_terminated_length": 1123.0, + "completions/mean_length": 482.9285888671875, + "completions/mean_terminated_length": 482.9285888671875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 2.351818416301264, + "grad_norm": 0.8050830960273743, + "kl": 0.13671875, + "learning_rate": 1e-06, + "loss": 0.046, + "num_tokens": 267333679.0, + "reward": 1.531250238418579, + "reward_std": 0.21188195049762726, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.53125, + "rewards/curriculum_aware_reward_fn/std": 0.3829782009124756, + "step": 2279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 497.0089416503906, + "completions/mean_terminated_length": 464.5856018066406, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 2.3528501418622647, + "grad_norm": 0.6472569704055786, + "kl": 0.1285400390625, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 267445897.0, + "reward": 1.46473228931427, + "reward_std": 0.18970660865306854, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4736607074737549, + "rewards/curriculum_aware_reward_fn/std": 0.4229859709739685, + "step": 2280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1989.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 434.9375305175781, + "completions/mean_terminated_length": 434.9375305175781, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 2.3538818674232656, + "grad_norm": 0.8479146957397461, + "kl": 0.141357421875, + "learning_rate": 1e-06, + "loss": 0.008, + "num_tokens": 267565982.0, + "reward": 1.579017996788025, + "reward_std": 0.23399677872657776, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5790179371833801, + "rewards/curriculum_aware_reward_fn/std": 0.39178720116615295, + "step": 2281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2161.0, + "completions/max_terminated_length": 2161.0, + "completions/mean_length": 502.1339416503906, + "completions/mean_terminated_length": 502.1339416503906, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 2.354913592984266, + "grad_norm": 0.763183057308197, + "kl": 0.13623046875, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 267687636.0, + "reward": 1.3544644117355347, + "reward_std": 0.19222049415111542, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3544642925262451, + "rewards/curriculum_aware_reward_fn/std": 0.38663211464881897, + "step": 2282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3179.0, + "completions/mean_length": 527.7589721679688, + "completions/mean_terminated_length": 495.61260986328125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 2.355945318545267, + "grad_norm": 0.7211725115776062, + "kl": 0.139892578125, + "learning_rate": 1e-06, + "loss": 0.0191, + "num_tokens": 267812230.0, + "reward": 1.559821605682373, + "reward_std": 0.24818488955497742, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5687500238418579, + "rewards/curriculum_aware_reward_fn/std": 0.45182934403419495, + "step": 2283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2126.0, + "completions/mean_length": 521.1964721679688, + "completions/mean_terminated_length": 456.1999816894531, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 2.3569770441062676, + "grad_norm": 0.6131947636604309, + "kl": 0.11376953125, + "learning_rate": 1e-06, + "loss": -0.0256, + "num_tokens": 267944300.0, + "reward": 1.596428632736206, + "reward_std": 0.1304352879524231, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5964285731315613, + "rewards/curriculum_aware_reward_fn/std": 0.43701043725013733, + "step": 2284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1044.0, + "completions/mean_length": 488.52679443359375, + "completions/mean_terminated_length": 456.02703857421875, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 2.3580087696672685, + "grad_norm": 0.7710050344467163, + "kl": 0.13134765625, + "learning_rate": 1e-06, + "loss": 0.0432, + "num_tokens": 268057549.0, + "reward": 1.5428574085235596, + "reward_std": 0.20850172638893127, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5517857670783997, + "rewards/curriculum_aware_reward_fn/std": 0.38717785477638245, + "step": 2285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2987.0, + "completions/mean_length": 477.2410888671875, + "completions/mean_terminated_length": 444.6396484375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 2.3590404952282693, + "grad_norm": 0.66845703125, + "kl": 0.1307373046875, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 268173615.0, + "reward": 1.6566965579986572, + "reward_std": 0.12179917097091675, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6566964387893677, + "rewards/curriculum_aware_reward_fn/std": 0.41554778814315796, + "step": 2286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3478.0, + "completions/max_terminated_length": 3478.0, + "completions/mean_length": 483.15179443359375, + "completions/mean_terminated_length": 483.15179443359375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 2.36007222078927, + "grad_norm": 0.7215074300765991, + "kl": 0.124267578125, + "learning_rate": 1e-06, + "loss": 0.0232, + "num_tokens": 268292324.0, + "reward": 1.5687501430511475, + "reward_std": 0.18647773563861847, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5687499642372131, + "rewards/curriculum_aware_reward_fn/std": 0.46938356757164, + "step": 2287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2271.0, + "completions/mean_length": 580.8214721679688, + "completions/mean_terminated_length": 516.9090576171875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 2.361103946350271, + "grad_norm": 0.7281975150108337, + "kl": 0.13330078125, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 268422973.0, + "reward": 1.333035945892334, + "reward_std": 0.22421413660049438, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3419643044471741, + "rewards/curriculum_aware_reward_fn/std": 0.3631415367126465, + "step": 2288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1779.0, + "completions/mean_length": 479.3482360839844, + "completions/mean_terminated_length": 446.7657775878906, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 2.362135671911272, + "grad_norm": 0.829347550868988, + "kl": 0.155517578125, + "learning_rate": 1e-06, + "loss": -0.0273, + "num_tokens": 268533493.0, + "reward": 1.5982143878936768, + "reward_std": 0.19711095094680786, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6071428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.42462795972824097, + "step": 2289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1011.0, + "completions/mean_length": 573.6964721679688, + "completions/mean_terminated_length": 476.75225830078125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 2.3631673974722722, + "grad_norm": 0.7868594527244568, + "kl": 0.130126953125, + "learning_rate": 1e-06, + "loss": 0.0456, + "num_tokens": 268666263.0, + "reward": 1.456696629524231, + "reward_std": 0.2679513692855835, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45669645071029663, + "rewards/curriculum_aware_reward_fn/std": 0.4117363393306732, + "step": 2290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1440.0, + "completions/max_terminated_length": 1440.0, + "completions/mean_length": 471.52679443359375, + "completions/mean_terminated_length": 471.52679443359375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 2.364199123033273, + "grad_norm": 0.8038163781166077, + "kl": 0.140625, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 268788206.0, + "reward": 1.5169644355773926, + "reward_std": 0.21957655251026154, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5348213911056519, + "rewards/curriculum_aware_reward_fn/std": 0.44331061840057373, + "step": 2291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3965.0, + "completions/mean_length": 577.1875, + "completions/mean_terminated_length": 480.33941650390625, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 2.365230848594274, + "grad_norm": 0.6000884175300598, + "kl": 0.145263671875, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 268917678.0, + "reward": 1.5205358266830444, + "reward_std": 0.10521666705608368, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5205357074737549, + "rewards/curriculum_aware_reward_fn/std": 0.4275718927383423, + "step": 2292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1222.0, + "completions/mean_length": 442.0982360839844, + "completions/mean_terminated_length": 409.18017578125, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 2.3662625741552747, + "grad_norm": 0.7253805994987488, + "kl": 0.145751953125, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 269031334.0, + "reward": 1.6093751192092896, + "reward_std": 0.17531929910182953, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.609375, + "rewards/curriculum_aware_reward_fn/std": 0.38744550943374634, + "step": 2293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2273.0, + "completions/mean_length": 560.7232666015625, + "completions/mean_terminated_length": 528.8739013671875, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 2.3672942997162756, + "grad_norm": 0.6465750932693481, + "kl": 0.127685546875, + "learning_rate": 1e-06, + "loss": 0.0492, + "num_tokens": 269163176.0, + "reward": 1.4004465341567993, + "reward_std": 0.19433052837848663, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40044641494750977, + "rewards/curriculum_aware_reward_fn/std": 0.3976561725139618, + "step": 2294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1605.0, + "completions/max_terminated_length": 1605.0, + "completions/mean_length": 472.6607360839844, + "completions/mean_terminated_length": 472.6607360839844, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 2.3683260252772764, + "grad_norm": 0.7045727372169495, + "kl": 0.1318359375, + "learning_rate": 1e-06, + "loss": -0.0369, + "num_tokens": 269284777.0, + "reward": 1.4316965341567993, + "reward_std": 0.2325831949710846, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.44062498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.42162978649139404, + "step": 2295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1876.0, + "completions/mean_length": 544.6964721679688, + "completions/mean_terminated_length": 446.9541015625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 2.369357750838277, + "grad_norm": 0.7127491235733032, + "kl": 0.1331787109375, + "learning_rate": 1e-06, + "loss": -0.0304, + "num_tokens": 269415090.0, + "reward": 1.4366071224212646, + "reward_std": 0.18148307502269745, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43660715222358704, + "rewards/curriculum_aware_reward_fn/std": 0.4378480315208435, + "step": 2296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2440.0, + "completions/mean_length": 551.4732666015625, + "completions/mean_terminated_length": 487.0272521972656, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 2.3703894763992777, + "grad_norm": 0.5928733348846436, + "kl": 0.138916015625, + "learning_rate": 1e-06, + "loss": 0.0211, + "num_tokens": 269546302.0, + "reward": 1.4285714626312256, + "reward_std": 0.18738706409931183, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4375, + "rewards/curriculum_aware_reward_fn/std": 0.43024715781211853, + "step": 2297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1582.0, + "completions/mean_length": 522.6607666015625, + "completions/mean_terminated_length": 490.4684753417969, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 2.3714212019602785, + "grad_norm": 0.6640140414237976, + "kl": 0.140380859375, + "learning_rate": 1e-06, + "loss": 0.0715, + "num_tokens": 269676604.0, + "reward": 1.4602679014205933, + "reward_std": 0.06861122697591782, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4691964089870453, + "rewards/curriculum_aware_reward_fn/std": 0.4369347095489502, + "step": 2298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3210.0, + "completions/max_terminated_length": 3210.0, + "completions/mean_length": 555.9553833007812, + "completions/mean_terminated_length": 555.9553833007812, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 2.3724529275212793, + "grad_norm": 0.6696567535400391, + "kl": 0.131103515625, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 269805119.0, + "reward": 1.474107265472412, + "reward_std": 0.24978794157505035, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47410711646080017, + "rewards/curriculum_aware_reward_fn/std": 0.4368695914745331, + "step": 2299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1357.0, + "completions/mean_length": 597.0267944335938, + "completions/mean_terminated_length": 500.7247619628906, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 2.37348465308228, + "grad_norm": 0.6863952279090881, + "kl": 0.142578125, + "learning_rate": 1e-06, + "loss": -0.0474, + "num_tokens": 269934734.0, + "reward": 1.4522322416305542, + "reward_std": 0.17944176495075226, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45223215222358704, + "rewards/curriculum_aware_reward_fn/std": 0.4349789321422577, + "step": 2300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1239.0, + "completions/max_terminated_length": 1239.0, + "completions/mean_length": 467.232177734375, + "completions/mean_terminated_length": 467.232177734375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 2.374516378643281, + "grad_norm": 0.7441481351852417, + "kl": 0.144775390625, + "learning_rate": 1e-06, + "loss": -0.0051, + "num_tokens": 270053109.0, + "reward": 1.5754464864730835, + "reward_std": 0.2411445826292038, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5754464268684387, + "rewards/curriculum_aware_reward_fn/std": 0.41940516233444214, + "step": 2301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3599.0, + "completions/mean_length": 611.8660888671875, + "completions/mean_terminated_length": 548.5181884765625, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 2.375548104204282, + "grad_norm": 0.5696609020233154, + "kl": 0.132568359375, + "learning_rate": 1e-06, + "loss": -0.0106, + "num_tokens": 270193864.0, + "reward": 1.5214287042617798, + "reward_std": 0.14514337480068207, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5214285850524902, + "rewards/curriculum_aware_reward_fn/std": 0.4346332550048828, + "step": 2302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3726.0, + "completions/mean_length": 564.125, + "completions/mean_terminated_length": 499.9090881347656, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 2.3765798297652823, + "grad_norm": 0.7344022989273071, + "kl": 0.148681640625, + "learning_rate": 1e-06, + "loss": -0.0311, + "num_tokens": 270328600.0, + "reward": 1.5705357789993286, + "reward_std": 0.25526320934295654, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5705357193946838, + "rewards/curriculum_aware_reward_fn/std": 0.5795910954475403, + "step": 2303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1341.0, + "completions/mean_length": 550.0625, + "completions/mean_terminated_length": 485.59088134765625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 2.377611555326283, + "grad_norm": 0.7728509306907654, + "kl": 0.1435546875, + "learning_rate": 1e-06, + "loss": 0.006, + "num_tokens": 270447824.0, + "reward": 1.5660713911056519, + "reward_std": 0.2546859383583069, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5660714507102966, + "rewards/curriculum_aware_reward_fn/std": 0.4277646541595459, + "step": 2304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3232.0, + "completions/mean_length": 610.5535888671875, + "completions/mean_terminated_length": 547.1818237304688, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 2.378643280887284, + "grad_norm": 0.650519073009491, + "kl": 0.1356201171875, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 270597035.0, + "reward": 1.6129463911056519, + "reward_std": 0.170953169465065, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6218749284744263, + "rewards/curriculum_aware_reward_fn/std": 0.42297643423080444, + "step": 2305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1710.0, + "completions/mean_length": 560.6517944335938, + "completions/mean_terminated_length": 496.3727111816406, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 2.379675006448285, + "grad_norm": 4.208312511444092, + "kl": 0.671630859375, + "learning_rate": 1e-06, + "loss": 0.0193, + "num_tokens": 270725215.0, + "reward": 1.4825893640518188, + "reward_std": 0.24740009009838104, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4915178120136261, + "rewards/curriculum_aware_reward_fn/std": 0.4709530472755432, + "step": 2306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4092.0, + "completions/mean_length": 612.625, + "completions/mean_terminated_length": 549.2908935546875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 2.3807067320092856, + "grad_norm": 0.5898601412773132, + "kl": 0.1246337890625, + "learning_rate": 1e-06, + "loss": 0.0192, + "num_tokens": 270861036.0, + "reward": 1.501339316368103, + "reward_std": 0.14689765870571136, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5102678537368774, + "rewards/curriculum_aware_reward_fn/std": 0.4506821036338806, + "step": 2307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2972.0, + "completions/max_terminated_length": 2972.0, + "completions/mean_length": 506.4375305175781, + "completions/mean_terminated_length": 506.4375305175781, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 2.3817384575702865, + "grad_norm": 0.7323086857795715, + "kl": 0.150634765625, + "learning_rate": 1e-06, + "loss": 0.0484, + "num_tokens": 270982269.0, + "reward": 1.5071427822113037, + "reward_std": 0.18541446328163147, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5160714387893677, + "rewards/curriculum_aware_reward_fn/std": 0.4374997615814209, + "step": 2308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1938.0, + "completions/max_terminated_length": 1938.0, + "completions/mean_length": 574.0357666015625, + "completions/mean_terminated_length": 574.0357666015625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 2.382770183131287, + "grad_norm": 0.7194310426712036, + "kl": 0.1356201171875, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 271117268.0, + "reward": 1.3991073369979858, + "reward_std": 0.21801282465457916, + "rewards/code_format_reward/mean": 0.9642857313156128, + "rewards/code_format_reward/std": 0.18641091883182526, + "rewards/curriculum_aware_reward_fn/mean": 0.43482139706611633, + "rewards/curriculum_aware_reward_fn/std": 0.39540374279022217, + "step": 2309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2293.0, + "completions/mean_length": 613.232177734375, + "completions/mean_terminated_length": 549.9090576171875, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 2.3838019086922877, + "grad_norm": 0.7925401329994202, + "kl": 0.14013671875, + "learning_rate": 1e-06, + "loss": 0.0518, + "num_tokens": 271259478.0, + "reward": 1.4066966772079468, + "reward_std": 0.3506569266319275, + "rewards/code_format_reward/mean": 0.8928571343421936, + "rewards/code_format_reward/std": 0.3106848895549774, + "rewards/curriculum_aware_reward_fn/mean": 0.5138393044471741, + "rewards/curriculum_aware_reward_fn/std": 0.4164527356624603, + "step": 2310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2534.0, + "completions/max_terminated_length": 2534.0, + "completions/mean_length": 562.2232666015625, + "completions/mean_terminated_length": 562.2232666015625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 2.3848336342532885, + "grad_norm": 0.7765570878982544, + "kl": 0.13818359375, + "learning_rate": 1e-06, + "loss": 0.0486, + "num_tokens": 271388646.0, + "reward": 1.3718751668930054, + "reward_std": 0.38008591532707214, + "rewards/code_format_reward/mean": 0.8392857313156128, + "rewards/code_format_reward/std": 0.368917852640152, + "rewards/curriculum_aware_reward_fn/mean": 0.532589316368103, + "rewards/curriculum_aware_reward_fn/std": 0.44865715503692627, + "step": 2311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2419.0, + "completions/mean_length": 662.4107666015625, + "completions/mean_terminated_length": 599.9818115234375, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 2.3858653598142894, + "grad_norm": 0.8191332221031189, + "kl": 0.136962890625, + "learning_rate": 1e-06, + "loss": -0.0209, + "num_tokens": 271534292.0, + "reward": 1.2691963911056519, + "reward_std": 0.47810545563697815, + "rewards/code_format_reward/mean": 0.7678571343421936, + "rewards/code_format_reward/std": 0.4240972101688385, + "rewards/curriculum_aware_reward_fn/mean": 0.501339316368103, + "rewards/curriculum_aware_reward_fn/std": 0.4248189926147461, + "step": 2312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1816.0, + "completions/max_terminated_length": 1816.0, + "completions/mean_length": 568.9017944335938, + "completions/mean_terminated_length": 568.9017944335938, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 2.38689708537529, + "grad_norm": 0.8380082845687866, + "kl": 0.137939453125, + "learning_rate": 1e-06, + "loss": 0.0386, + "num_tokens": 271667708.0, + "reward": 1.2825894355773926, + "reward_std": 0.5124881863594055, + "rewards/code_format_reward/mean": 0.7857142686843872, + "rewards/code_format_reward/std": 0.41217005252838135, + "rewards/curriculum_aware_reward_fn/mean": 0.49687501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3966231942176819, + "step": 2313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1416.0, + "completions/mean_length": 542.6517944335938, + "completions/mean_terminated_length": 510.6396484375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 2.387928810936291, + "grad_norm": 0.7812418937683105, + "kl": 0.1484375, + "learning_rate": 1e-06, + "loss": -0.0299, + "num_tokens": 271790456.0, + "reward": 1.4799107313156128, + "reward_std": 0.3694676160812378, + "rewards/code_format_reward/mean": 0.9017857313156128, + "rewards/code_format_reward/std": 0.2989417314529419, + "rewards/curriculum_aware_reward_fn/mean": 0.578125, + "rewards/curriculum_aware_reward_fn/std": 0.3827226459980011, + "step": 2314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 589.4732666015625, + "completions/mean_terminated_length": 557.8828735351562, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 2.388960536497292, + "grad_norm": 0.7020928263664246, + "kl": 0.137939453125, + "learning_rate": 1e-06, + "loss": 0.0453, + "num_tokens": 271924720.0, + "reward": 1.4084821939468384, + "reward_std": 0.2678705155849457, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.4352678656578064, + "rewards/curriculum_aware_reward_fn/std": 0.4016653597354889, + "step": 2315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1615.0, + "completions/mean_length": 577.5892944335938, + "completions/mean_terminated_length": 545.8919067382812, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 2.3899922620582927, + "grad_norm": 0.7192690968513489, + "kl": 0.149169921875, + "learning_rate": 1e-06, + "loss": 0.055, + "num_tokens": 272055378.0, + "reward": 1.466071605682373, + "reward_std": 0.21608500182628632, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.48392853140830994, + "rewards/curriculum_aware_reward_fn/std": 0.43827131390571594, + "step": 2316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2519.0, + "completions/max_terminated_length": 2519.0, + "completions/mean_length": 522.3482666015625, + "completions/mean_terminated_length": 522.3482666015625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 2.391023987619293, + "grad_norm": 0.743002712726593, + "kl": 0.139404296875, + "learning_rate": 1e-06, + "loss": 0.0163, + "num_tokens": 272180584.0, + "reward": 1.3870537281036377, + "reward_std": 0.18511782586574554, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.40491071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.36223530769348145, + "step": 2317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2154.0, + "completions/max_terminated_length": 2154.0, + "completions/mean_length": 482.33038330078125, + "completions/mean_terminated_length": 482.33038330078125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 2.392055713180294, + "grad_norm": 0.6114434599876404, + "kl": 0.136474609375, + "learning_rate": 1e-06, + "loss": -0.0142, + "num_tokens": 272306159.0, + "reward": 1.5455358028411865, + "reward_std": 0.10450504720211029, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5633928179740906, + "rewards/curriculum_aware_reward_fn/std": 0.4080265760421753, + "step": 2318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1303.0, + "completions/max_terminated_length": 1303.0, + "completions/mean_length": 536.3660888671875, + "completions/mean_terminated_length": 536.3660888671875, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 2.393087438741295, + "grad_norm": 0.7736281156539917, + "kl": 0.141845703125, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 272438549.0, + "reward": 1.3486608266830444, + "reward_std": 0.26764869689941406, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3575893044471741, + "rewards/curriculum_aware_reward_fn/std": 0.3818052113056183, + "step": 2319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1726.0, + "completions/max_terminated_length": 1726.0, + "completions/mean_length": 490.52679443359375, + "completions/mean_terminated_length": 490.52679443359375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 2.3941191643022957, + "grad_norm": 0.6975762248039246, + "kl": 0.14013671875, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 272556922.0, + "reward": 1.6674107313156128, + "reward_std": 0.24090106785297394, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6674107313156128, + "rewards/curriculum_aware_reward_fn/std": 0.4099390208721161, + "step": 2320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1051.0, + "completions/max_terminated_length": 1051.0, + "completions/mean_length": 472.044677734375, + "completions/mean_terminated_length": 472.044677734375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 2.3951508898632965, + "grad_norm": 0.7948324680328369, + "kl": 0.14599609375, + "learning_rate": 1e-06, + "loss": 0.0365, + "num_tokens": 272678230.0, + "reward": 1.4892858266830444, + "reward_std": 0.24986693263053894, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4892857074737549, + "rewards/curriculum_aware_reward_fn/std": 0.46728214621543884, + "step": 2321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1366.0, + "completions/max_terminated_length": 1366.0, + "completions/mean_length": 532.8392944335938, + "completions/mean_terminated_length": 532.8392944335938, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 2.396182615424297, + "grad_norm": 0.7188726663589478, + "kl": 0.148681640625, + "learning_rate": 1e-06, + "loss": -0.0278, + "num_tokens": 272813424.0, + "reward": 1.5388394594192505, + "reward_std": 0.20557691156864166, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5388392806053162, + "rewards/curriculum_aware_reward_fn/std": 0.38954293727874756, + "step": 2322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1163.0, + "completions/max_terminated_length": 1163.0, + "completions/mean_length": 501.26788330078125, + "completions/mean_terminated_length": 501.26788330078125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 2.3972143409852977, + "grad_norm": 0.8085273504257202, + "kl": 0.14697265625, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 272934204.0, + "reward": 1.4508929252624512, + "reward_std": 0.18216951191425323, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4508928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.40246883034706116, + "step": 2323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1537.0, + "completions/mean_length": 575.857177734375, + "completions/mean_terminated_length": 544.1441650390625, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 2.3982460665462986, + "grad_norm": 0.8094402551651001, + "kl": 0.143310546875, + "learning_rate": 1e-06, + "loss": -0.0251, + "num_tokens": 273070429.0, + "reward": 1.5200893878936768, + "reward_std": 0.19970080256462097, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5379464030265808, + "rewards/curriculum_aware_reward_fn/std": 0.3922242820262909, + "step": 2324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2306.0, + "completions/max_terminated_length": 2306.0, + "completions/mean_length": 497.8660888671875, + "completions/mean_terminated_length": 497.8660888671875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 2.3992777921072994, + "grad_norm": 0.6123159527778625, + "kl": 0.140625, + "learning_rate": 1e-06, + "loss": 0.0331, + "num_tokens": 273186026.0, + "reward": 1.5879465341567993, + "reward_std": 0.1610545963048935, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5879464745521545, + "rewards/curriculum_aware_reward_fn/std": 0.4583187997341156, + "step": 2325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1580.0, + "completions/max_terminated_length": 1580.0, + "completions/mean_length": 508.1607360839844, + "completions/mean_terminated_length": 508.1607360839844, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 2.4003095176683003, + "grad_norm": 0.6526829600334167, + "kl": 0.15283203125, + "learning_rate": 1e-06, + "loss": -0.0024, + "num_tokens": 273312177.0, + "reward": 1.6111607551574707, + "reward_std": 0.21387022733688354, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6111606955528259, + "rewards/curriculum_aware_reward_fn/std": 0.48887211084365845, + "step": 2326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1584.0, + "completions/max_terminated_length": 1584.0, + "completions/mean_length": 497.1964416503906, + "completions/mean_terminated_length": 497.1964416503906, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 2.401341243229301, + "grad_norm": 0.7623741030693054, + "kl": 0.14404296875, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 273438612.0, + "reward": 1.440178632736206, + "reward_std": 0.14009803533554077, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44017860293388367, + "rewards/curriculum_aware_reward_fn/std": 0.4423951804637909, + "step": 2327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1524.0, + "completions/max_terminated_length": 1524.0, + "completions/mean_length": 552.3392944335938, + "completions/mean_terminated_length": 552.3392944335938, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 2.402372968790302, + "grad_norm": 0.5391533374786377, + "kl": 0.1292724609375, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 273565707.0, + "reward": 1.4424108266830444, + "reward_std": 0.14713065326213837, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4424107074737549, + "rewards/curriculum_aware_reward_fn/std": 0.4474255442619324, + "step": 2328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 851.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 467.5982360839844, + "completions/mean_terminated_length": 467.5982360839844, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 2.4034046943513028, + "grad_norm": 0.8083183169364929, + "kl": 0.15283203125, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 273684543.0, + "reward": 1.4950894117355347, + "reward_std": 0.20851272344589233, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4950892925262451, + "rewards/curriculum_aware_reward_fn/std": 0.44377371668815613, + "step": 2329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1367.0, + "completions/max_terminated_length": 1367.0, + "completions/mean_length": 527.875, + "completions/mean_terminated_length": 527.875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 2.404436419912303, + "grad_norm": 0.6930996775627136, + "kl": 0.137451171875, + "learning_rate": 1e-06, + "loss": 0.0285, + "num_tokens": 273816815.0, + "reward": 1.446874976158142, + "reward_std": 0.17862723767757416, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44687503576278687, + "rewards/curriculum_aware_reward_fn/std": 0.4424685835838318, + "step": 2330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1438.0, + "completions/max_terminated_length": 1438.0, + "completions/mean_length": 544.3303833007812, + "completions/mean_terminated_length": 544.3303833007812, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 2.405468145473304, + "grad_norm": 0.7719082832336426, + "kl": 0.146728515625, + "learning_rate": 1e-06, + "loss": 0.0278, + "num_tokens": 273940647.0, + "reward": 1.516964316368103, + "reward_std": 0.20712628960609436, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5258928537368774, + "rewards/curriculum_aware_reward_fn/std": 0.41340065002441406, + "step": 2331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 968.0, + "completions/max_terminated_length": 968.0, + "completions/mean_length": 493.83929443359375, + "completions/mean_terminated_length": 493.83929443359375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 2.406499871034305, + "grad_norm": 0.7389146685600281, + "kl": 0.150634765625, + "learning_rate": 1e-06, + "loss": 0.0144, + "num_tokens": 274066468.0, + "reward": 1.3535715341567993, + "reward_std": 0.18684786558151245, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.4170088768005371, + "step": 2332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1510.0, + "completions/max_terminated_length": 1510.0, + "completions/mean_length": 531.75, + "completions/mean_terminated_length": 531.75, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 2.4075315965953057, + "grad_norm": 0.7130151987075806, + "kl": 0.130859375, + "learning_rate": 1e-06, + "loss": 0.0211, + "num_tokens": 274195058.0, + "reward": 1.3089287281036377, + "reward_std": 0.20015646517276764, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30892854928970337, + "rewards/curriculum_aware_reward_fn/std": 0.3642970621585846, + "step": 2333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1272.0, + "completions/max_terminated_length": 1272.0, + "completions/mean_length": 506.4107360839844, + "completions/mean_terminated_length": 506.4107360839844, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 2.4085633221563065, + "grad_norm": 0.6497871279716492, + "kl": 0.14111328125, + "learning_rate": 1e-06, + "loss": -0.016, + "num_tokens": 274317590.0, + "reward": 1.6272321939468384, + "reward_std": 0.1722278743982315, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6272321343421936, + "rewards/curriculum_aware_reward_fn/std": 0.4044175148010254, + "step": 2334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1132.0, + "completions/max_terminated_length": 1132.0, + "completions/mean_length": 571.9285888671875, + "completions/mean_terminated_length": 571.9285888671875, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 2.4095950477173074, + "grad_norm": 0.6405318379402161, + "kl": 0.1298828125, + "learning_rate": 1e-06, + "loss": -0.0098, + "num_tokens": 274453326.0, + "reward": 1.2607142925262451, + "reward_std": 0.12552452087402344, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2607142925262451, + "rewards/curriculum_aware_reward_fn/std": 0.3606443703174591, + "step": 2335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2659.0, + "completions/max_terminated_length": 2659.0, + "completions/mean_length": 522.8035888671875, + "completions/mean_terminated_length": 522.8035888671875, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 2.4106267732783078, + "grad_norm": 0.7238879203796387, + "kl": 0.143310546875, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 274580116.0, + "reward": 1.6142858266830444, + "reward_std": 0.26368361711502075, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6142857670783997, + "rewards/curriculum_aware_reward_fn/std": 0.4057263731956482, + "step": 2336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1801.0, + "completions/max_terminated_length": 1801.0, + "completions/mean_length": 430.3035888671875, + "completions/mean_terminated_length": 430.3035888671875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 2.4116584988393086, + "grad_norm": 0.5872229933738708, + "kl": 0.13525390625, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 274694194.0, + "reward": 1.612053632736206, + "reward_std": 0.12714780867099762, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6120535731315613, + "rewards/curriculum_aware_reward_fn/std": 0.4642268419265747, + "step": 2337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1089.0, + "completions/max_terminated_length": 1089.0, + "completions/mean_length": 426.6160888671875, + "completions/mean_terminated_length": 426.6160888671875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 2.4126902244003094, + "grad_norm": 1.0934064388275146, + "kl": 0.226806640625, + "learning_rate": 1e-06, + "loss": 0.0316, + "num_tokens": 274797296.0, + "reward": 1.6227680444717407, + "reward_std": 0.1348063200712204, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6227678656578064, + "rewards/curriculum_aware_reward_fn/std": 0.42695698142051697, + "step": 2338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1102.0, + "completions/max_terminated_length": 1102.0, + "completions/mean_length": 474.107177734375, + "completions/mean_terminated_length": 474.107177734375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 2.4137219499613103, + "grad_norm": 0.7120774984359741, + "kl": 0.14697265625, + "learning_rate": 1e-06, + "loss": 0.0417, + "num_tokens": 274911786.0, + "reward": 1.5785715579986572, + "reward_std": 0.14783048629760742, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5785714387893677, + "rewards/curriculum_aware_reward_fn/std": 0.4151258170604706, + "step": 2339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1504.0, + "completions/max_terminated_length": 1504.0, + "completions/mean_length": 450.5000305175781, + "completions/mean_terminated_length": 450.5000305175781, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 2.414753675522311, + "grad_norm": 0.8808677792549133, + "kl": 0.142333984375, + "learning_rate": 1e-06, + "loss": 0.045, + "num_tokens": 275023338.0, + "reward": 1.6892857551574707, + "reward_std": 0.17411798238754272, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6892856955528259, + "rewards/curriculum_aware_reward_fn/std": 0.39217737317085266, + "step": 2340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1393.0, + "completions/max_terminated_length": 1393.0, + "completions/mean_length": 496.9107360839844, + "completions/mean_terminated_length": 496.9107360839844, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 2.415785401083312, + "grad_norm": 0.7008398175239563, + "kl": 0.15185546875, + "learning_rate": 1e-06, + "loss": 0.0185, + "num_tokens": 275146059.0, + "reward": 1.4919644594192505, + "reward_std": 0.2472720891237259, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5098214149475098, + "rewards/curriculum_aware_reward_fn/std": 0.3980152904987335, + "step": 2341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 955.0, + "completions/max_terminated_length": 955.0, + "completions/mean_length": 497.482177734375, + "completions/mean_terminated_length": 497.482177734375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 2.416817126644313, + "grad_norm": 0.6380481123924255, + "kl": 0.1229248046875, + "learning_rate": 1e-06, + "loss": -0.0137, + "num_tokens": 275274962.0, + "reward": 1.4843751192092896, + "reward_std": 0.13236817717552185, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4933035373687744, + "rewards/curriculum_aware_reward_fn/std": 0.43782302737236023, + "step": 2342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1041.0, + "completions/max_terminated_length": 1041.0, + "completions/mean_length": 472.9464416503906, + "completions/mean_terminated_length": 472.9464416503906, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 2.417848852205313, + "grad_norm": 0.7028351426124573, + "kl": 0.148193359375, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 275400914.0, + "reward": 1.4901787042617798, + "reward_std": 0.21455268561840057, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.49910715222358704, + "rewards/curriculum_aware_reward_fn/std": 0.43319380283355713, + "step": 2343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1112.0, + "completions/max_terminated_length": 1112.0, + "completions/mean_length": 474.2857360839844, + "completions/mean_terminated_length": 474.2857360839844, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 2.418880577766314, + "grad_norm": 0.6846729516983032, + "kl": 0.14599609375, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 275521360.0, + "reward": 1.5718752145767212, + "reward_std": 0.1859087347984314, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5718749761581421, + "rewards/curriculum_aware_reward_fn/std": 0.4332353174686432, + "step": 2344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1161.0, + "completions/max_terminated_length": 1161.0, + "completions/mean_length": 459.6250305175781, + "completions/mean_terminated_length": 459.6250305175781, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 2.419912303327315, + "grad_norm": 0.8312680125236511, + "kl": 0.185791015625, + "learning_rate": 1e-06, + "loss": -0.0093, + "num_tokens": 275645547.0, + "reward": 1.5767858028411865, + "reward_std": 0.18420013785362244, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.576785683631897, + "rewards/curriculum_aware_reward_fn/std": 0.5562069416046143, + "step": 2345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 966.0, + "completions/max_terminated_length": 966.0, + "completions/mean_length": 474.4464416503906, + "completions/mean_terminated_length": 474.4464416503906, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 2.4209440288883157, + "grad_norm": 0.8140915632247925, + "kl": 0.13916015625, + "learning_rate": 1e-06, + "loss": -0.0049, + "num_tokens": 275772962.0, + "reward": 1.4232144355773926, + "reward_std": 0.2549702823162079, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.43214288353919983, + "rewards/curriculum_aware_reward_fn/std": 0.40346792340278625, + "step": 2346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1507.0, + "completions/max_terminated_length": 1507.0, + "completions/mean_length": 471.232177734375, + "completions/mean_terminated_length": 471.232177734375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 2.4219757544493166, + "grad_norm": 0.6817914247512817, + "kl": 0.1494140625, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 275900150.0, + "reward": 1.4977679252624512, + "reward_std": 0.1634839028120041, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4977678656578064, + "rewards/curriculum_aware_reward_fn/std": 0.40838003158569336, + "step": 2347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1616.0, + "completions/max_terminated_length": 1616.0, + "completions/mean_length": 480.5535888671875, + "completions/mean_terminated_length": 480.5535888671875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 2.4230074800103174, + "grad_norm": 0.6364054083824158, + "kl": 0.140625, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 276021613.0, + "reward": 1.5656249523162842, + "reward_std": 0.11939293891191483, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.565625011920929, + "rewards/curriculum_aware_reward_fn/std": 0.43687212467193604, + "step": 2348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 976.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 446.9285888671875, + "completions/mean_terminated_length": 446.9285888671875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 2.424039205571318, + "grad_norm": 0.9955868124961853, + "kl": 0.195556640625, + "learning_rate": 1e-06, + "loss": 0.0341, + "num_tokens": 276141830.0, + "reward": 1.5102678537368774, + "reward_std": 0.16214899718761444, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5102678537368774, + "rewards/curriculum_aware_reward_fn/std": 0.41601601243019104, + "step": 2349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1011.0, + "completions/max_terminated_length": 1011.0, + "completions/mean_length": 500.83929443359375, + "completions/mean_terminated_length": 500.83929443359375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 2.4250709311323186, + "grad_norm": 0.7112178206443787, + "kl": 0.1337890625, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 276268239.0, + "reward": 1.4861608743667603, + "reward_std": 0.22006341814994812, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4861607253551483, + "rewards/curriculum_aware_reward_fn/std": 0.44006600975990295, + "step": 2350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1270.0, + "completions/max_terminated_length": 1270.0, + "completions/mean_length": 524.5892944335938, + "completions/mean_terminated_length": 524.5892944335938, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 2.4261026566933195, + "grad_norm": 0.7407634258270264, + "kl": 0.13525390625, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 276390525.0, + "reward": 1.5602679252624512, + "reward_std": 0.28421685099601746, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5602678656578064, + "rewards/curriculum_aware_reward_fn/std": 0.48985832929611206, + "step": 2351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1226.0, + "completions/max_terminated_length": 1226.0, + "completions/mean_length": 465.4375305175781, + "completions/mean_terminated_length": 465.4375305175781, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 2.4271343822543203, + "grad_norm": 0.6548599600791931, + "kl": 0.1337890625, + "learning_rate": 1e-06, + "loss": 0.0255, + "num_tokens": 276516308.0, + "reward": 1.5254465341567993, + "reward_std": 0.2117275446653366, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5254464149475098, + "rewards/curriculum_aware_reward_fn/std": 0.452769011259079, + "step": 2352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1437.0, + "completions/max_terminated_length": 1437.0, + "completions/mean_length": 563.3482666015625, + "completions/mean_terminated_length": 563.3482666015625, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 2.428166107815321, + "grad_norm": 0.724729597568512, + "kl": 0.1287841796875, + "learning_rate": 1e-06, + "loss": 0.034, + "num_tokens": 276665258.0, + "reward": 1.4772323369979858, + "reward_std": 0.23840339481830597, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4772321581840515, + "rewards/curriculum_aware_reward_fn/std": 0.43495306372642517, + "step": 2353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1079.0, + "completions/max_terminated_length": 1079.0, + "completions/mean_length": 487.169677734375, + "completions/mean_terminated_length": 487.169677734375, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 2.429197833376322, + "grad_norm": 0.7468309998512268, + "kl": 0.14453125, + "learning_rate": 1e-06, + "loss": 0.0198, + "num_tokens": 276783444.0, + "reward": 1.4058037996292114, + "reward_std": 0.1380435973405838, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4058035910129547, + "rewards/curriculum_aware_reward_fn/std": 0.35853466391563416, + "step": 2354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1146.0, + "completions/max_terminated_length": 1146.0, + "completions/mean_length": 523.4017944335938, + "completions/mean_terminated_length": 523.4017944335938, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 2.430229558937323, + "grad_norm": 35.98688507080078, + "kl": 0.153076171875, + "learning_rate": 1e-06, + "loss": -0.0095, + "num_tokens": 276910938.0, + "reward": 1.502232313156128, + "reward_std": 0.25507819652557373, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5200892686843872, + "rewards/curriculum_aware_reward_fn/std": 0.45788684487342834, + "step": 2355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 433.4285888671875, + "completions/mean_terminated_length": 433.4285888671875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 2.4312612844983237, + "grad_norm": 0.7806984782218933, + "kl": 0.143310546875, + "learning_rate": 1e-06, + "loss": 0.0096, + "num_tokens": 277024078.0, + "reward": 1.6334822177886963, + "reward_std": 0.18449801206588745, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6334820985794067, + "rewards/curriculum_aware_reward_fn/std": 0.42903560400009155, + "step": 2356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1000.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 435.3482360839844, + "completions/mean_terminated_length": 435.3482360839844, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 2.432293010059324, + "grad_norm": 0.7690462470054626, + "kl": 0.187255859375, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 277130160.0, + "reward": 1.557142972946167, + "reward_std": 0.16556496918201447, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5660714507102966, + "rewards/curriculum_aware_reward_fn/std": 0.43827131390571594, + "step": 2357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1043.0, + "completions/max_terminated_length": 1043.0, + "completions/mean_length": 449.65179443359375, + "completions/mean_terminated_length": 449.65179443359375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 2.433324735620325, + "grad_norm": 0.721041738986969, + "kl": 0.14208984375, + "learning_rate": 1e-06, + "loss": -0.0062, + "num_tokens": 277247707.0, + "reward": 1.5343750715255737, + "reward_std": 0.22333721816539764, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5433036088943481, + "rewards/curriculum_aware_reward_fn/std": 0.5206326246261597, + "step": 2358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1342.0, + "completions/max_terminated_length": 1342.0, + "completions/mean_length": 486.45538330078125, + "completions/mean_terminated_length": 486.45538330078125, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 2.4343564611813258, + "grad_norm": 0.7117599248886108, + "kl": 0.1356201171875, + "learning_rate": 1e-06, + "loss": -0.0165, + "num_tokens": 277373856.0, + "reward": 1.5066964626312256, + "reward_std": 0.18413867056369781, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5066964030265808, + "rewards/curriculum_aware_reward_fn/std": 0.4238123595714569, + "step": 2359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1196.0, + "completions/max_terminated_length": 1196.0, + "completions/mean_length": 497.33929443359375, + "completions/mean_terminated_length": 497.33929443359375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 2.4353881867423266, + "grad_norm": 0.7181190252304077, + "kl": 0.14306640625, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 277501133.0, + "reward": 1.4875000715255737, + "reward_std": 0.22479689121246338, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.49642857909202576, + "rewards/curriculum_aware_reward_fn/std": 0.47309190034866333, + "step": 2360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1140.0, + "completions/max_terminated_length": 1140.0, + "completions/mean_length": 494.2589416503906, + "completions/mean_terminated_length": 494.2589416503906, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 2.4364199123033274, + "grad_norm": 0.6397473216056824, + "kl": 0.1258544921875, + "learning_rate": 1e-06, + "loss": 0.038, + "num_tokens": 277615290.0, + "reward": 1.5732144117355347, + "reward_std": 0.1664787083864212, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5732142329216003, + "rewards/curriculum_aware_reward_fn/std": 0.4036393165588379, + "step": 2361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1095.0, + "completions/max_terminated_length": 1095.0, + "completions/mean_length": 514.8660888671875, + "completions/mean_terminated_length": 514.8660888671875, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 2.4374516378643283, + "grad_norm": 0.6321114301681519, + "kl": 0.1474609375, + "learning_rate": 1e-06, + "loss": 0.0108, + "num_tokens": 277741996.0, + "reward": 1.4080358743667603, + "reward_std": 0.15648870170116425, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.41696426272392273, + "rewards/curriculum_aware_reward_fn/std": 0.41457420587539673, + "step": 2362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2378.0, + "completions/max_terminated_length": 2378.0, + "completions/mean_length": 497.26788330078125, + "completions/mean_terminated_length": 497.26788330078125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 2.4384833634253287, + "grad_norm": 0.6805658340454102, + "kl": 0.140869140625, + "learning_rate": 1e-06, + "loss": 0.0307, + "num_tokens": 277872051.0, + "reward": 1.441517949104309, + "reward_std": 0.14166250824928284, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.45044639706611633, + "rewards/curriculum_aware_reward_fn/std": 0.46501457691192627, + "step": 2363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1578.0, + "completions/mean_length": 484.419677734375, + "completions/mean_terminated_length": 451.8829040527344, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 2.4395150889863295, + "grad_norm": 0.6962588429450989, + "kl": 0.1448974609375, + "learning_rate": 1e-06, + "loss": 0.0474, + "num_tokens": 277987206.0, + "reward": 1.579464316368103, + "reward_std": 0.22141891717910767, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5883928537368774, + "rewards/curriculum_aware_reward_fn/std": 0.4113799035549164, + "step": 2364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1429.0, + "completions/max_terminated_length": 1429.0, + "completions/mean_length": 465.5982360839844, + "completions/mean_terminated_length": 465.5982360839844, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 2.4405468145473304, + "grad_norm": 0.6728730201721191, + "kl": 0.135009765625, + "learning_rate": 1e-06, + "loss": 0.015, + "num_tokens": 278115009.0, + "reward": 1.516964316368103, + "reward_std": 0.1518820971250534, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.516964316368103, + "rewards/curriculum_aware_reward_fn/std": 0.4160926043987274, + "step": 2365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 987.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 465.0089416503906, + "completions/mean_terminated_length": 465.0089416503906, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 2.441578540108331, + "grad_norm": 0.6624149084091187, + "kl": 0.144287109375, + "learning_rate": 1e-06, + "loss": 0.0286, + "num_tokens": 278229061.0, + "reward": 1.6191965341567993, + "reward_std": 0.20879162847995758, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.628125011920929, + "rewards/curriculum_aware_reward_fn/std": 0.40519049763679504, + "step": 2366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 954.0, + "completions/max_terminated_length": 954.0, + "completions/mean_length": 486.76788330078125, + "completions/mean_terminated_length": 486.76788330078125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 2.442610265669332, + "grad_norm": 0.8079473376274109, + "kl": 0.131591796875, + "learning_rate": 1e-06, + "loss": -0.0174, + "num_tokens": 278355721.0, + "reward": 1.419196605682373, + "reward_std": 0.20018258690834045, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4281249940395355, + "rewards/curriculum_aware_reward_fn/std": 0.40424442291259766, + "step": 2367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 923.0, + "completions/max_terminated_length": 923.0, + "completions/mean_length": 489.20538330078125, + "completions/mean_terminated_length": 489.20538330078125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 2.443641991230333, + "grad_norm": 0.7336907386779785, + "kl": 0.138671875, + "learning_rate": 1e-06, + "loss": 0.0086, + "num_tokens": 278486189.0, + "reward": 1.458035945892334, + "reward_std": 0.1761673539876938, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.4758928418159485, + "rewards/curriculum_aware_reward_fn/std": 0.4279630184173584, + "step": 2368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1041.0, + "completions/max_terminated_length": 1041.0, + "completions/mean_length": 445.9375305175781, + "completions/mean_terminated_length": 445.9375305175781, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 2.4446737167913337, + "grad_norm": 0.582846462726593, + "kl": 0.1337890625, + "learning_rate": 1e-06, + "loss": -0.0132, + "num_tokens": 278600639.0, + "reward": 1.51160728931427, + "reward_std": 0.11233289539813995, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5205357670783997, + "rewards/curriculum_aware_reward_fn/std": 0.475553423166275, + "step": 2369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1211.0, + "completions/max_terminated_length": 1211.0, + "completions/mean_length": 478.08929443359375, + "completions/mean_terminated_length": 478.08929443359375, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 2.445705442352334, + "grad_norm": 0.7559561133384705, + "kl": 0.1390380859375, + "learning_rate": 1e-06, + "loss": 0.0368, + "num_tokens": 278729491.0, + "reward": 1.567410945892334, + "reward_std": 0.19153565168380737, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5674106478691101, + "rewards/curriculum_aware_reward_fn/std": 0.3921360671520233, + "step": 2370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1568.0, + "completions/max_terminated_length": 1568.0, + "completions/mean_length": 461.4285888671875, + "completions/mean_terminated_length": 461.4285888671875, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 2.446737167913335, + "grad_norm": 0.6146445274353027, + "kl": 0.141357421875, + "learning_rate": 1e-06, + "loss": 0.0185, + "num_tokens": 278847293.0, + "reward": 1.6013394594192505, + "reward_std": 0.2132793515920639, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.6191964149475098, + "rewards/curriculum_aware_reward_fn/std": 0.427555650472641, + "step": 2371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1367.0, + "completions/max_terminated_length": 1367.0, + "completions/mean_length": 500.2500305175781, + "completions/mean_terminated_length": 500.2500305175781, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 2.447768893474336, + "grad_norm": 0.7753989100456238, + "kl": 0.135498046875, + "learning_rate": 1e-06, + "loss": 0.0414, + "num_tokens": 278971613.0, + "reward": 1.417410969734192, + "reward_std": 0.2362671196460724, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.4352678656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3869282007217407, + "step": 2372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 443.6875305175781, + "completions/mean_terminated_length": 443.6875305175781, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 2.4488006190353366, + "grad_norm": 0.715697169303894, + "kl": 0.137451171875, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 279092664.0, + "reward": 1.5709823369979858, + "reward_std": 0.1610252857208252, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5709820985794067, + "rewards/curriculum_aware_reward_fn/std": 0.43103888630867004, + "step": 2373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 931.0, + "completions/max_terminated_length": 931.0, + "completions/mean_length": 417.5000305175781, + "completions/mean_terminated_length": 417.5000305175781, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 2.4498323445963375, + "grad_norm": 0.753842294216156, + "kl": 0.13525390625, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 279208038.0, + "reward": 1.3986607789993286, + "reward_std": 0.10415496677160263, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.39866071939468384, + "rewards/curriculum_aware_reward_fn/std": 0.410199910402298, + "step": 2374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1408.0, + "completions/max_terminated_length": 1408.0, + "completions/mean_length": 425.1339416503906, + "completions/mean_terminated_length": 425.1339416503906, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 2.4508640701573383, + "grad_norm": 0.8698515892028809, + "kl": 0.14306640625, + "learning_rate": 1e-06, + "loss": -0.0085, + "num_tokens": 279316308.0, + "reward": 1.5468751192092896, + "reward_std": 0.24297165870666504, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5558035969734192, + "rewards/curriculum_aware_reward_fn/std": 0.42509910464286804, + "step": 2375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 904.0, + "completions/max_terminated_length": 904.0, + "completions/mean_length": 442.8750305175781, + "completions/mean_terminated_length": 442.8750305175781, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 2.4518957957183387, + "grad_norm": 0.8830894231796265, + "kl": 0.142578125, + "learning_rate": 1e-06, + "loss": 0.0269, + "num_tokens": 279423877.0, + "reward": 1.4602681398391724, + "reward_std": 0.24288131296634674, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4691964089870453, + "rewards/curriculum_aware_reward_fn/std": 0.4479304552078247, + "step": 2376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1137.0, + "completions/max_terminated_length": 1137.0, + "completions/mean_length": 447.3482360839844, + "completions/mean_terminated_length": 447.3482360839844, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 2.4529275212793396, + "grad_norm": 0.8079304099082947, + "kl": 0.155029296875, + "learning_rate": 1e-06, + "loss": 0.0289, + "num_tokens": 279536189.0, + "reward": 1.4299107789993286, + "reward_std": 0.21381406486034393, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.43883928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.38035720586776733, + "step": 2377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1028.0, + "completions/max_terminated_length": 1028.0, + "completions/mean_length": 421.3571472167969, + "completions/mean_terminated_length": 421.3571472167969, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 2.4539592468403404, + "grad_norm": 0.7308657169342041, + "kl": 0.14208984375, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 279644204.0, + "reward": 1.6513394117355347, + "reward_std": 0.11506469547748566, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6513392329216003, + "rewards/curriculum_aware_reward_fn/std": 0.4083288013935089, + "step": 2378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1154.0, + "completions/max_terminated_length": 1154.0, + "completions/mean_length": 445.1339416503906, + "completions/mean_terminated_length": 445.1339416503906, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 2.4549909724013412, + "grad_norm": 0.7689999938011169, + "kl": 0.13427734375, + "learning_rate": 1e-06, + "loss": 0.0103, + "num_tokens": 279760293.0, + "reward": 1.5821430683135986, + "reward_std": 0.2600085437297821, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5821428298950195, + "rewards/curriculum_aware_reward_fn/std": 0.447659432888031, + "step": 2379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 417.7589416503906, + "completions/mean_terminated_length": 417.7589416503906, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 2.456022697962342, + "grad_norm": 0.8615774512290955, + "kl": 0.135009765625, + "learning_rate": 1e-06, + "loss": -0.0033, + "num_tokens": 279873498.0, + "reward": 1.5763394832611084, + "reward_std": 0.3193800449371338, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5763392448425293, + "rewards/curriculum_aware_reward_fn/std": 0.41388946771621704, + "step": 2380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1015.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 427.1696472167969, + "completions/mean_terminated_length": 427.1696472167969, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 2.457054423523343, + "grad_norm": 0.7411530017852783, + "kl": 0.1309814453125, + "learning_rate": 1e-06, + "loss": 0.0187, + "num_tokens": 279988275.0, + "reward": 1.589732050895691, + "reward_std": 0.26775115728378296, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5897321701049805, + "rewards/curriculum_aware_reward_fn/std": 0.44418859481811523, + "step": 2381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 375.2321472167969, + "completions/mean_terminated_length": 375.2321472167969, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 2.4580861490843438, + "grad_norm": 1.1352235078811646, + "kl": 0.1357421875, + "learning_rate": 1e-06, + "loss": 0.0148, + "num_tokens": 280086980.0, + "reward": 1.5687501430511475, + "reward_std": 0.12106120586395264, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5776785612106323, + "rewards/curriculum_aware_reward_fn/std": 0.44331061840057373, + "step": 2382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1090.0, + "completions/max_terminated_length": 1090.0, + "completions/mean_length": 432.3035888671875, + "completions/mean_terminated_length": 432.3035888671875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 2.459117874645344, + "grad_norm": 0.8991947174072266, + "kl": 0.1484375, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 280205440.0, + "reward": 1.6343752145767212, + "reward_std": 0.2517082095146179, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6343750357627869, + "rewards/curriculum_aware_reward_fn/std": 0.46075883507728577, + "step": 2383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 944.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 425.6339416503906, + "completions/mean_terminated_length": 425.6339416503906, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 2.460149600206345, + "grad_norm": 0.8231422901153564, + "kl": 0.145751953125, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 280310224.0, + "reward": 1.403571605682373, + "reward_std": 0.18607930839061737, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4035714268684387, + "rewards/curriculum_aware_reward_fn/std": 0.4063524007797241, + "step": 2384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1028.0, + "completions/max_terminated_length": 1028.0, + "completions/mean_length": 449.0535888671875, + "completions/mean_terminated_length": 449.0535888671875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 2.461181325767346, + "grad_norm": 0.6670386791229248, + "kl": 0.1282958984375, + "learning_rate": 1e-06, + "loss": -0.0036, + "num_tokens": 280424904.0, + "reward": 1.4799107313156128, + "reward_std": 0.14945851266384125, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4799107015132904, + "rewards/curriculum_aware_reward_fn/std": 0.454529732465744, + "step": 2385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 454.0714416503906, + "completions/mean_terminated_length": 454.0714416503906, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 2.4622130513283467, + "grad_norm": 0.8334097862243652, + "kl": 0.130126953125, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 280545515.0, + "reward": 1.5950894355773926, + "reward_std": 0.2628384232521057, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.595089316368103, + "rewards/curriculum_aware_reward_fn/std": 0.3990393877029419, + "step": 2386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 413.95538330078125, + "completions/mean_terminated_length": 413.95538330078125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 2.4632447768893475, + "grad_norm": 0.7344523668289185, + "kl": 0.14404296875, + "learning_rate": 1e-06, + "loss": 0.0226, + "num_tokens": 280668934.0, + "reward": 1.611607313156128, + "reward_std": 0.16572576761245728, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6205357313156128, + "rewards/curriculum_aware_reward_fn/std": 0.3946462571620941, + "step": 2387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1023.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 449.8839416503906, + "completions/mean_terminated_length": 449.8839416503906, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 2.4642765024503483, + "grad_norm": 0.7916790246963501, + "kl": 0.13818359375, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 280786059.0, + "reward": 1.4486607313156128, + "reward_std": 0.20076420903205872, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4486607015132904, + "rewards/curriculum_aware_reward_fn/std": 0.4619513750076294, + "step": 2388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 442.3750305175781, + "completions/mean_terminated_length": 442.3750305175781, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 2.4653082280113487, + "grad_norm": 0.8291255831718445, + "kl": 0.14404296875, + "learning_rate": 1e-06, + "loss": 0.0127, + "num_tokens": 280893929.0, + "reward": 1.583482265472412, + "reward_std": 0.2577275037765503, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.592410683631897, + "rewards/curriculum_aware_reward_fn/std": 0.41078969836235046, + "step": 2389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 847.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 421.89288330078125, + "completions/mean_terminated_length": 421.89288330078125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 2.4663399535723496, + "grad_norm": 1.1254520416259766, + "kl": 0.280517578125, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 281002788.0, + "reward": 1.4928573369979858, + "reward_std": 0.20928914844989777, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4928571879863739, + "rewards/curriculum_aware_reward_fn/std": 0.41230276226997375, + "step": 2390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 432.3035888671875, + "completions/mean_terminated_length": 432.3035888671875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 2.4673716791333504, + "grad_norm": 0.7715088129043579, + "kl": 0.136962890625, + "learning_rate": 1e-06, + "loss": 0.016, + "num_tokens": 281116936.0, + "reward": 1.5491071939468384, + "reward_std": 0.2484361082315445, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5491071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.42170727252960205, + "step": 2391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1027.0, + "completions/max_terminated_length": 1027.0, + "completions/mean_length": 440.95538330078125, + "completions/mean_terminated_length": 440.95538330078125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 2.4684034046943513, + "grad_norm": 0.8398274779319763, + "kl": 0.14990234375, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 281242924.0, + "reward": 1.518303632736206, + "reward_std": 0.2738878130912781, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5272321701049805, + "rewards/curriculum_aware_reward_fn/std": 0.4173537790775299, + "step": 2392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1224.0, + "completions/max_terminated_length": 1224.0, + "completions/mean_length": 455.232177734375, + "completions/mean_terminated_length": 455.232177734375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 2.469435130255352, + "grad_norm": 0.9072141647338867, + "kl": 0.1353759765625, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 281359521.0, + "reward": 1.5566965341567993, + "reward_std": 0.32403483986854553, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5745536088943481, + "rewards/curriculum_aware_reward_fn/std": 0.433090478181839, + "step": 2393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1093.0, + "completions/max_terminated_length": 1093.0, + "completions/mean_length": 489.46429443359375, + "completions/mean_terminated_length": 489.46429443359375, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 2.470466855816353, + "grad_norm": 0.7404239177703857, + "kl": 0.125732421875, + "learning_rate": 1e-06, + "loss": -0.0239, + "num_tokens": 281496549.0, + "reward": 1.5584824085235596, + "reward_std": 0.21812358498573303, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5584821105003357, + "rewards/curriculum_aware_reward_fn/std": 0.44209206104278564, + "step": 2394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1078.0, + "completions/max_terminated_length": 1078.0, + "completions/mean_length": 431.4107360839844, + "completions/mean_terminated_length": 431.4107360839844, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 2.471498581377354, + "grad_norm": 0.7665273547172546, + "kl": 0.13427734375, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 281613684.0, + "reward": 1.3875001668930054, + "reward_std": 0.22730781137943268, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38750001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.4318147301673889, + "step": 2395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 445.9285888671875, + "completions/mean_terminated_length": 445.9285888671875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 2.4725303069383546, + "grad_norm": 0.8466961979866028, + "kl": 0.140625, + "learning_rate": 1e-06, + "loss": 0.0086, + "num_tokens": 281728119.0, + "reward": 1.653571605682373, + "reward_std": 0.248878613114357, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.6714285612106323, + "rewards/curriculum_aware_reward_fn/std": 0.6378000974655151, + "step": 2396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 897.0, + "completions/max_terminated_length": 897.0, + "completions/mean_length": 439.6339416503906, + "completions/mean_terminated_length": 439.6339416503906, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 2.473562032499355, + "grad_norm": 0.7907005548477173, + "kl": 0.1319580078125, + "learning_rate": 1e-06, + "loss": 0.0136, + "num_tokens": 281841170.0, + "reward": 1.4718750715255737, + "reward_std": 0.2342146635055542, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.48080354928970337, + "rewards/curriculum_aware_reward_fn/std": 0.4226815402507782, + "step": 2397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 978.0, + "completions/max_terminated_length": 978.0, + "completions/mean_length": 465.857177734375, + "completions/mean_terminated_length": 465.857177734375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 2.474593758060356, + "grad_norm": 0.8809310793876648, + "kl": 0.13330078125, + "learning_rate": 1e-06, + "loss": -0.019, + "num_tokens": 281958124.0, + "reward": 1.5223214626312256, + "reward_std": 0.2313743233680725, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.53125, + "rewards/curriculum_aware_reward_fn/std": 0.4156360924243927, + "step": 2398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 432.08929443359375, + "completions/mean_terminated_length": 432.08929443359375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 2.4756254836213567, + "grad_norm": 0.7881631851196289, + "kl": 0.13671875, + "learning_rate": 1e-06, + "loss": -0.0186, + "num_tokens": 282073634.0, + "reward": 1.4223215579986572, + "reward_std": 0.14511245489120483, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4312500059604645, + "rewards/curriculum_aware_reward_fn/std": 0.4221419394016266, + "step": 2399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 999.0, + "completions/max_terminated_length": 999.0, + "completions/mean_length": 484.9732360839844, + "completions/mean_terminated_length": 484.9732360839844, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 2.4766572091823575, + "grad_norm": 0.6421279907226562, + "kl": 0.123291015625, + "learning_rate": 1e-06, + "loss": 0.011, + "num_tokens": 282194528.0, + "reward": 1.438839316368103, + "reward_std": 0.1716233789920807, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.45669645071029663, + "rewards/curriculum_aware_reward_fn/std": 0.42661961913108826, + "step": 2400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1000.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 426.3035888671875, + "completions/mean_terminated_length": 426.3035888671875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 2.4776889347433584, + "grad_norm": 0.9702169299125671, + "kl": 0.143310546875, + "learning_rate": 1e-06, + "loss": -0.0101, + "num_tokens": 282310111.0, + "reward": 1.5482145547866821, + "reward_std": 0.2584778964519501, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5660713911056519, + "rewards/curriculum_aware_reward_fn/std": 0.42358478903770447, + "step": 2401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1197.0, + "completions/max_terminated_length": 1197.0, + "completions/mean_length": 476.4107360839844, + "completions/mean_terminated_length": 476.4107360839844, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 2.4787206603043592, + "grad_norm": 0.7732918858528137, + "kl": 0.127197265625, + "learning_rate": 1e-06, + "loss": 0.0136, + "num_tokens": 282427785.0, + "reward": 1.4455358982086182, + "reward_std": 0.1627698689699173, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44553571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.43653804063796997, + "step": 2402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 895.0, + "completions/max_terminated_length": 895.0, + "completions/mean_length": 429.01788330078125, + "completions/mean_terminated_length": 429.01788330078125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 2.4797523858653596, + "grad_norm": 0.8249169588088989, + "kl": 0.17626953125, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 282535895.0, + "reward": 1.6513392925262451, + "reward_std": 0.1300283670425415, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6513392329216003, + "rewards/curriculum_aware_reward_fn/std": 0.5273364782333374, + "step": 2403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1156.0, + "completions/max_terminated_length": 1156.0, + "completions/mean_length": 444.544677734375, + "completions/mean_terminated_length": 444.544677734375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 2.4807841114263605, + "grad_norm": 0.9494808912277222, + "kl": 0.152099609375, + "learning_rate": 1e-06, + "loss": 0.0127, + "num_tokens": 282659443.0, + "reward": 1.4933037757873535, + "reward_std": 0.28490185737609863, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4933035671710968, + "rewards/curriculum_aware_reward_fn/std": 0.3912612795829773, + "step": 2404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1589.0, + "completions/max_terminated_length": 1589.0, + "completions/mean_length": 430.3571472167969, + "completions/mean_terminated_length": 430.3571472167969, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 2.4818158369873613, + "grad_norm": 0.8510450720787048, + "kl": 0.13623046875, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 282770594.0, + "reward": 1.5924108028411865, + "reward_std": 0.29261523485183716, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6013392806053162, + "rewards/curriculum_aware_reward_fn/std": 0.4421721398830414, + "step": 2405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1464.0, + "completions/max_terminated_length": 1464.0, + "completions/mean_length": 467.9910888671875, + "completions/mean_terminated_length": 467.9910888671875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 2.482847562548362, + "grad_norm": 0.6033523082733154, + "kl": 0.1334228515625, + "learning_rate": 1e-06, + "loss": -0.0172, + "num_tokens": 282887346.0, + "reward": 1.3660714626312256, + "reward_std": 0.18522292375564575, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3660714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.5164507627487183, + "step": 2406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 450.232177734375, + "completions/mean_terminated_length": 450.232177734375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 2.483879288109363, + "grad_norm": 0.789675235748291, + "kl": 0.138916015625, + "learning_rate": 1e-06, + "loss": 0.0153, + "num_tokens": 283001570.0, + "reward": 1.443750023841858, + "reward_std": 0.2680908441543579, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4437499940395355, + "rewards/curriculum_aware_reward_fn/std": 0.43924200534820557, + "step": 2407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1272.0, + "completions/max_terminated_length": 1272.0, + "completions/mean_length": 467.9107360839844, + "completions/mean_terminated_length": 467.9107360839844, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 2.484911013670364, + "grad_norm": 0.7835040092468262, + "kl": 0.138427734375, + "learning_rate": 1e-06, + "loss": 0.0191, + "num_tokens": 283118418.0, + "reward": 1.403571605682373, + "reward_std": 0.20980599522590637, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.41249996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.38651248812675476, + "step": 2408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 935.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 451.2500305175781, + "completions/mean_terminated_length": 451.2500305175781, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 2.4859427392313647, + "grad_norm": 0.8104711174964905, + "kl": 0.1455078125, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 283243452.0, + "reward": 1.5383929014205933, + "reward_std": 0.20198899507522583, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5383928418159485, + "rewards/curriculum_aware_reward_fn/std": 0.4044227600097656, + "step": 2409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1491.0, + "completions/max_terminated_length": 1491.0, + "completions/mean_length": 418.6964416503906, + "completions/mean_terminated_length": 418.6964416503906, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 2.486974464792365, + "grad_norm": 0.808778703212738, + "kl": 0.152099609375, + "learning_rate": 1e-06, + "loss": 0.0456, + "num_tokens": 283359730.0, + "reward": 1.5857144594192505, + "reward_std": 0.2007414996623993, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5857142806053162, + "rewards/curriculum_aware_reward_fn/std": 0.44440609216690063, + "step": 2410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1139.0, + "completions/max_terminated_length": 1139.0, + "completions/mean_length": 427.5089416503906, + "completions/mean_terminated_length": 427.5089416503906, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 2.488006190353366, + "grad_norm": 0.7132419347763062, + "kl": 0.1376953125, + "learning_rate": 1e-06, + "loss": -0.0038, + "num_tokens": 283475462.0, + "reward": 1.547767996788025, + "reward_std": 0.1488623172044754, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5477678179740906, + "rewards/curriculum_aware_reward_fn/std": 0.4024356007575989, + "step": 2411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1545.0, + "completions/max_terminated_length": 1545.0, + "completions/mean_length": 437.08038330078125, + "completions/mean_terminated_length": 437.08038330078125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 2.4890379159143667, + "grad_norm": 0.8942745327949524, + "kl": 0.151611328125, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 283592777.0, + "reward": 1.534821629524231, + "reward_std": 0.21150675415992737, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.543749988079071, + "rewards/curriculum_aware_reward_fn/std": 0.4239520728588104, + "step": 2412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 995.0, + "completions/max_terminated_length": 995.0, + "completions/mean_length": 461.1964416503906, + "completions/mean_terminated_length": 461.1964416503906, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 2.4900696414753676, + "grad_norm": 0.8201280832290649, + "kl": 0.1416015625, + "learning_rate": 1e-06, + "loss": 0.0483, + "num_tokens": 283717882.0, + "reward": 1.479017972946167, + "reward_std": 0.2597033679485321, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.49687501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.3952009975910187, + "step": 2413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 898.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 419.0535888671875, + "completions/mean_terminated_length": 419.0535888671875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 2.4911013670363684, + "grad_norm": 0.7921744585037231, + "kl": 0.14111328125, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 283833731.0, + "reward": 1.5223214626312256, + "reward_std": 0.21495002508163452, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.53125, + "rewards/curriculum_aware_reward_fn/std": 0.43262892961502075, + "step": 2414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1020.0, + "completions/max_terminated_length": 1020.0, + "completions/mean_length": 458.232177734375, + "completions/mean_terminated_length": 458.232177734375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 2.4921330925973693, + "grad_norm": 0.921675443649292, + "kl": 0.17626953125, + "learning_rate": 1e-06, + "loss": 0.0137, + "num_tokens": 283951714.0, + "reward": 1.4861608743667603, + "reward_std": 0.32043954730033875, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.5129464268684387, + "rewards/curriculum_aware_reward_fn/std": 0.41713014245033264, + "step": 2415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1056.0, + "completions/max_terminated_length": 1056.0, + "completions/mean_length": 422.0714416503906, + "completions/mean_terminated_length": 422.0714416503906, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 2.4931648181583697, + "grad_norm": 0.8260616660118103, + "kl": 0.156982421875, + "learning_rate": 1e-06, + "loss": 0.0261, + "num_tokens": 284070827.0, + "reward": 1.5308037996292114, + "reward_std": 0.16464564204216003, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5308035612106323, + "rewards/curriculum_aware_reward_fn/std": 0.3940003514289856, + "step": 2416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2222.0, + "completions/max_terminated_length": 2222.0, + "completions/mean_length": 461.3660888671875, + "completions/mean_terminated_length": 461.3660888671875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 2.4941965437193705, + "grad_norm": 0.6988282203674316, + "kl": 0.14208984375, + "learning_rate": 1e-06, + "loss": 0.016, + "num_tokens": 284192528.0, + "reward": 1.4892858266830444, + "reward_std": 0.21458344161510468, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4982142746448517, + "rewards/curriculum_aware_reward_fn/std": 0.42223817110061646, + "step": 2417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 992.0, + "completions/max_terminated_length": 992.0, + "completions/mean_length": 429.95538330078125, + "completions/mean_terminated_length": 429.95538330078125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 2.4952282692803713, + "grad_norm": 0.8364533185958862, + "kl": 0.149169921875, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 284310942.0, + "reward": 1.4254463911056519, + "reward_std": 0.2122143805027008, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.43437501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.37296178936958313, + "step": 2418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1056.0, + "completions/max_terminated_length": 1056.0, + "completions/mean_length": 425.7321472167969, + "completions/mean_terminated_length": 425.7321472167969, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 2.496259994841372, + "grad_norm": 0.8200624585151672, + "kl": 0.158447265625, + "learning_rate": 1e-06, + "loss": 0.0135, + "num_tokens": 284422436.0, + "reward": 1.5343750715255737, + "reward_std": 0.2781384289264679, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5433035492897034, + "rewards/curriculum_aware_reward_fn/std": 0.49408644437789917, + "step": 2419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1269.0, + "completions/max_terminated_length": 1269.0, + "completions/mean_length": 376.5982360839844, + "completions/mean_terminated_length": 376.5982360839844, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 2.497291720402373, + "grad_norm": 0.8670206665992737, + "kl": 0.1552734375, + "learning_rate": 1e-06, + "loss": -0.0113, + "num_tokens": 284521266.0, + "reward": 1.6232143640518188, + "reward_std": 0.2606101334095001, + "rewards/code_format_reward/mean": 0.9642857313156128, + "rewards/code_format_reward/std": 0.18641091883182526, + "rewards/curriculum_aware_reward_fn/mean": 0.6589285731315613, + "rewards/curriculum_aware_reward_fn/std": 0.40859097242355347, + "step": 2420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 907.0, + "completions/max_terminated_length": 907.0, + "completions/mean_length": 427.2321472167969, + "completions/mean_terminated_length": 427.2321472167969, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 2.498323445963374, + "grad_norm": 0.7723448276519775, + "kl": 0.130859375, + "learning_rate": 1e-06, + "loss": 0.0047, + "num_tokens": 284639788.0, + "reward": 1.6361607313156128, + "reward_std": 0.214374378323555, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6361607313156128, + "rewards/curriculum_aware_reward_fn/std": 0.3943982720375061, + "step": 2421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1003.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 492.1607360839844, + "completions/mean_terminated_length": 492.1607360839844, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 2.4993551715243747, + "grad_norm": 0.7836470007896423, + "kl": 0.144287109375, + "learning_rate": 1e-06, + "loss": -0.0216, + "num_tokens": 284772645.0, + "reward": 1.3093751668930054, + "reward_std": 0.20374557375907898, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.31830358505249023, + "rewards/curriculum_aware_reward_fn/std": 0.4010681211948395, + "step": 2422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 805.0, + "completions/max_terminated_length": 805.0, + "completions/mean_length": 397.4375305175781, + "completions/mean_terminated_length": 397.4375305175781, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 2.5003868970853755, + "grad_norm": 0.9490916728973389, + "kl": 0.1904296875, + "learning_rate": 1e-06, + "loss": -0.0072, + "num_tokens": 284877295.0, + "reward": 1.6299108266830444, + "reward_std": 0.20785073935985565, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6299107670783997, + "rewards/curriculum_aware_reward_fn/std": 0.39589446783065796, + "step": 2423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1001.0, + "completions/max_terminated_length": 1001.0, + "completions/mean_length": 440.0089416503906, + "completions/mean_terminated_length": 440.0089416503906, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 2.501418622646376, + "grad_norm": 0.8236626982688904, + "kl": 0.145751953125, + "learning_rate": 1e-06, + "loss": -0.0189, + "num_tokens": 284993398.0, + "reward": 1.4593751430511475, + "reward_std": 0.22319842875003815, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4593749940395355, + "rewards/curriculum_aware_reward_fn/std": 0.41814321279525757, + "step": 2424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 398.21429443359375, + "completions/mean_terminated_length": 398.21429443359375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 2.5024503482073768, + "grad_norm": 0.7493698596954346, + "kl": 0.166259765625, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 285108005.0, + "reward": 1.4629465341567993, + "reward_std": 0.14567653834819794, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.47187498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.42879369854927063, + "step": 2425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1229.0, + "completions/max_terminated_length": 1229.0, + "completions/mean_length": 446.044677734375, + "completions/mean_terminated_length": 446.044677734375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 2.5034820737683776, + "grad_norm": 0.6787101030349731, + "kl": 0.140869140625, + "learning_rate": 1e-06, + "loss": -0.0293, + "num_tokens": 285226982.0, + "reward": 1.5388394594192505, + "reward_std": 0.16813233494758606, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5388392806053162, + "rewards/curriculum_aware_reward_fn/std": 0.3883848488330841, + "step": 2426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 845.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 415.8660888671875, + "completions/mean_terminated_length": 415.8660888671875, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 2.5045137993293785, + "grad_norm": 0.7325301170349121, + "kl": 0.13818359375, + "learning_rate": 1e-06, + "loss": 0.0158, + "num_tokens": 285345491.0, + "reward": 1.6705358028411865, + "reward_std": 0.1813599020242691, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6794642806053162, + "rewards/curriculum_aware_reward_fn/std": 0.37534090876579285, + "step": 2427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 425.9285888671875, + "completions/mean_terminated_length": 425.9285888671875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 2.5055455248903793, + "grad_norm": 0.8319756388664246, + "kl": 0.15673828125, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 285452198.0, + "reward": 1.4526787996292114, + "reward_std": 0.22211024165153503, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45267853140830994, + "rewards/curriculum_aware_reward_fn/std": 0.4158373177051544, + "step": 2428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 481.1875305175781, + "completions/mean_terminated_length": 481.1875305175781, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 2.5065772504513797, + "grad_norm": 0.8352397680282593, + "kl": 0.150146484375, + "learning_rate": 1e-06, + "loss": 0.0223, + "num_tokens": 285573018.0, + "reward": 1.540178656578064, + "reward_std": 0.23180150985717773, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5401785969734192, + "rewards/curriculum_aware_reward_fn/std": 0.40206894278526306, + "step": 2429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1052.0, + "completions/max_terminated_length": 1052.0, + "completions/mean_length": 447.5625305175781, + "completions/mean_terminated_length": 447.5625305175781, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 2.5076089760123805, + "grad_norm": 0.7682133913040161, + "kl": 0.15185546875, + "learning_rate": 1e-06, + "loss": -0.0143, + "num_tokens": 285694041.0, + "reward": 1.69910728931427, + "reward_std": 0.17765921354293823, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6991071701049805, + "rewards/curriculum_aware_reward_fn/std": 0.3511873185634613, + "step": 2430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 389.15179443359375, + "completions/mean_terminated_length": 389.15179443359375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 2.5086407015733814, + "grad_norm": 0.8312105536460876, + "kl": 0.150390625, + "learning_rate": 1e-06, + "loss": -0.0124, + "num_tokens": 285799996.0, + "reward": 1.6107144355773926, + "reward_std": 0.2674295902252197, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.610714316368103, + "rewards/curriculum_aware_reward_fn/std": 0.40260574221611023, + "step": 2431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1022.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 460.5714416503906, + "completions/mean_terminated_length": 460.5714416503906, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 2.509672427134382, + "grad_norm": 0.7323674559593201, + "kl": 0.161376953125, + "learning_rate": 1e-06, + "loss": 0.0303, + "num_tokens": 285923855.0, + "reward": 1.4986608028411865, + "reward_std": 0.24598772823810577, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.49866071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.4470856785774231, + "step": 2432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1230.0, + "completions/max_terminated_length": 1230.0, + "completions/mean_length": 438.83929443359375, + "completions/mean_terminated_length": 438.83929443359375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 2.510704152695383, + "grad_norm": 0.7977431416511536, + "kl": 0.148681640625, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 286039414.0, + "reward": 1.3580358028411865, + "reward_std": 0.14513644576072693, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35803571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.40794771909713745, + "step": 2433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1096.0, + "completions/max_terminated_length": 1096.0, + "completions/mean_length": 446.044677734375, + "completions/mean_terminated_length": 446.044677734375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 2.511735878256384, + "grad_norm": 0.7356109619140625, + "kl": 0.137451171875, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 286156312.0, + "reward": 1.5607144832611084, + "reward_std": 0.2145209014415741, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5696428418159485, + "rewards/curriculum_aware_reward_fn/std": 0.4123300611972809, + "step": 2434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1300.0, + "completions/max_terminated_length": 1300.0, + "completions/mean_length": 472.4375305175781, + "completions/mean_terminated_length": 472.4375305175781, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 2.5127676038173847, + "grad_norm": 0.8715661764144897, + "kl": 0.1484375, + "learning_rate": 1e-06, + "loss": 0.0146, + "num_tokens": 286283831.0, + "reward": 1.4593751430511475, + "reward_std": 0.21618635952472687, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4683035910129547, + "rewards/curriculum_aware_reward_fn/std": 0.36440718173980713, + "step": 2435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1116.0, + "completions/max_terminated_length": 1116.0, + "completions/mean_length": 444.3750305175781, + "completions/mean_terminated_length": 444.3750305175781, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 2.5137993293783856, + "grad_norm": 0.7075579166412354, + "kl": 0.14892578125, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 286396454.0, + "reward": 1.4535716772079468, + "reward_std": 0.16791139543056488, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.46250003576278687, + "rewards/curriculum_aware_reward_fn/std": 0.40711575746536255, + "step": 2436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3690.0, + "completions/max_terminated_length": 3690.0, + "completions/mean_length": 547.419677734375, + "completions/mean_terminated_length": 547.419677734375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 2.514831054939386, + "grad_norm": 0.8053109645843506, + "kl": 0.146240234375, + "learning_rate": 1e-06, + "loss": 0.0122, + "num_tokens": 286530754.0, + "reward": 1.5142858028411865, + "reward_std": 0.3455066978931427, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.514285683631897, + "rewards/curriculum_aware_reward_fn/std": 0.4575900733470917, + "step": 2437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1186.0, + "completions/max_terminated_length": 1186.0, + "completions/mean_length": 494.3482360839844, + "completions/mean_terminated_length": 494.3482360839844, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 2.515862780500387, + "grad_norm": 0.6410834789276123, + "kl": 0.15087890625, + "learning_rate": 1e-06, + "loss": 0.008, + "num_tokens": 286652124.0, + "reward": 1.3754465579986572, + "reward_std": 0.16579985618591309, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3754464089870453, + "rewards/curriculum_aware_reward_fn/std": 0.4305347502231598, + "step": 2438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 907.0, + "completions/max_terminated_length": 907.0, + "completions/mean_length": 452.7589416503906, + "completions/mean_terminated_length": 452.7589416503906, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 2.5168945060613876, + "grad_norm": 0.6556716561317444, + "kl": 0.145263671875, + "learning_rate": 1e-06, + "loss": 0.0196, + "num_tokens": 286772861.0, + "reward": 1.5330358743667603, + "reward_std": 0.14051130414009094, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5330356955528259, + "rewards/curriculum_aware_reward_fn/std": 0.43597015738487244, + "step": 2439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1012.0, + "completions/max_terminated_length": 1012.0, + "completions/mean_length": 463.982177734375, + "completions/mean_terminated_length": 463.982177734375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 2.5179262316223885, + "grad_norm": 0.6537541747093201, + "kl": 0.1416015625, + "learning_rate": 1e-06, + "loss": 0.0075, + "num_tokens": 286884243.0, + "reward": 1.6959822177886963, + "reward_std": 0.15117120742797852, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6959820985794067, + "rewards/curriculum_aware_reward_fn/std": 0.4043200612068176, + "step": 2440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1107.0, + "completions/max_terminated_length": 1107.0, + "completions/mean_length": 482.7232360839844, + "completions/mean_terminated_length": 482.7232360839844, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 2.5189579571833893, + "grad_norm": 0.688506543636322, + "kl": 0.156494140625, + "learning_rate": 1e-06, + "loss": 0.0186, + "num_tokens": 287000055.0, + "reward": 1.5665180683135986, + "reward_std": 0.17180171608924866, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5665178894996643, + "rewards/curriculum_aware_reward_fn/std": 0.36020442843437195, + "step": 2441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1895.0, + "completions/max_terminated_length": 1895.0, + "completions/mean_length": 468.77679443359375, + "completions/mean_terminated_length": 468.77679443359375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 2.5199896827443897, + "grad_norm": 0.8272270560264587, + "kl": 0.14013671875, + "learning_rate": 1e-06, + "loss": -0.0126, + "num_tokens": 287124553.0, + "reward": 1.5071429014205933, + "reward_std": 0.18399137258529663, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5071428418159485, + "rewards/curriculum_aware_reward_fn/std": 0.4011165201663971, + "step": 2442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1101.0, + "completions/max_terminated_length": 1101.0, + "completions/mean_length": 479.6785888671875, + "completions/mean_terminated_length": 479.6785888671875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 2.5210214083053906, + "grad_norm": 0.7374287843704224, + "kl": 0.159912109375, + "learning_rate": 1e-06, + "loss": 0.0235, + "num_tokens": 287249040.0, + "reward": 1.4357144832611084, + "reward_std": 0.23039209842681885, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4357143044471741, + "rewards/curriculum_aware_reward_fn/std": 0.3941822648048401, + "step": 2443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1153.0, + "completions/max_terminated_length": 1153.0, + "completions/mean_length": 440.1160888671875, + "completions/mean_terminated_length": 440.1160888671875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 2.5220531338663914, + "grad_norm": 0.833993136882782, + "kl": 0.154296875, + "learning_rate": 1e-06, + "loss": 0.0338, + "num_tokens": 287356440.0, + "reward": 1.5147321224212646, + "reward_std": 0.230870321393013, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5147321224212646, + "rewards/curriculum_aware_reward_fn/std": 0.3973809778690338, + "step": 2444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2434.0, + "completions/max_terminated_length": 2434.0, + "completions/mean_length": 415.95538330078125, + "completions/mean_terminated_length": 415.95538330078125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 2.5230848594273922, + "grad_norm": 0.8010690808296204, + "kl": 0.16357421875, + "learning_rate": 1e-06, + "loss": 0.0324, + "num_tokens": 287465558.0, + "reward": 1.6500000953674316, + "reward_std": 0.14846742153167725, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6500000357627869, + "rewards/curriculum_aware_reward_fn/std": 0.4071987271308899, + "step": 2445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 407.4910888671875, + "completions/mean_terminated_length": 407.4910888671875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 2.524116584988393, + "grad_norm": 0.8469009399414062, + "kl": 0.158935546875, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 287577354.0, + "reward": 1.6169644594192505, + "reward_std": 0.24044549465179443, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6169642806053162, + "rewards/curriculum_aware_reward_fn/std": 0.37425923347473145, + "step": 2446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1300.0, + "completions/max_terminated_length": 1300.0, + "completions/mean_length": 460.5714416503906, + "completions/mean_terminated_length": 460.5714416503906, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 2.525148310549394, + "grad_norm": 0.8326296210289001, + "kl": 0.15771484375, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 287696033.0, + "reward": 1.5566965341567993, + "reward_std": 0.2956428527832031, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5566964149475098, + "rewards/curriculum_aware_reward_fn/std": 0.44364863634109497, + "step": 2447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 452.2500305175781, + "completions/mean_terminated_length": 452.2500305175781, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 2.5261800361103948, + "grad_norm": 0.8186579942703247, + "kl": 0.154052734375, + "learning_rate": 1e-06, + "loss": 0.0246, + "num_tokens": 287826422.0, + "reward": 1.4031251668930054, + "reward_std": 0.2201891839504242, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40312501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.39775729179382324, + "step": 2448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 439.58038330078125, + "completions/mean_terminated_length": 439.58038330078125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 2.5272117616713956, + "grad_norm": 0.8152475357055664, + "kl": 0.16064453125, + "learning_rate": 1e-06, + "loss": 0.0321, + "num_tokens": 287944185.0, + "reward": 1.3558037281036377, + "reward_std": 0.22809594869613647, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35580354928970337, + "rewards/curriculum_aware_reward_fn/std": 0.35966357588768005, + "step": 2449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1347.0, + "completions/max_terminated_length": 1347.0, + "completions/mean_length": 466.9464416503906, + "completions/mean_terminated_length": 466.9464416503906, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 2.5282434872323964, + "grad_norm": 0.8093869090080261, + "kl": 0.15185546875, + "learning_rate": 1e-06, + "loss": -0.0066, + "num_tokens": 288064782.0, + "reward": 1.4241071939468384, + "reward_std": 0.21188300848007202, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4241071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.4358372986316681, + "step": 2450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1476.0, + "completions/max_terminated_length": 1476.0, + "completions/mean_length": 466.7500305175781, + "completions/mean_terminated_length": 466.7500305175781, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 2.529275212793397, + "grad_norm": 0.6856550574302673, + "kl": 0.16357421875, + "learning_rate": 1e-06, + "loss": 0.0227, + "num_tokens": 288187863.0, + "reward": 1.5281250476837158, + "reward_std": 0.2091558426618576, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5370535850524902, + "rewards/curriculum_aware_reward_fn/std": 0.4404515326023102, + "step": 2451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1072.0, + "completions/max_terminated_length": 1072.0, + "completions/mean_length": 426.2232360839844, + "completions/mean_terminated_length": 426.2232360839844, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 2.5303069383543977, + "grad_norm": 0.8464824557304382, + "kl": 0.15869140625, + "learning_rate": 1e-06, + "loss": -0.0258, + "num_tokens": 288296920.0, + "reward": 1.6477679014205933, + "reward_std": 0.1869218498468399, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6477679014205933, + "rewards/curriculum_aware_reward_fn/std": 0.39257684350013733, + "step": 2452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1098.0, + "completions/max_terminated_length": 1098.0, + "completions/mean_length": 461.7500305175781, + "completions/mean_terminated_length": 461.7500305175781, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 2.5313386639153985, + "grad_norm": 0.6828495860099792, + "kl": 0.148681640625, + "learning_rate": 1e-06, + "loss": 0.0081, + "num_tokens": 288412826.0, + "reward": 1.6468751430511475, + "reward_std": 0.12923133373260498, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6468750238418579, + "rewards/curriculum_aware_reward_fn/std": 0.40265342593193054, + "step": 2453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1999.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 492.77679443359375, + "completions/mean_terminated_length": 492.77679443359375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 2.5323703894763994, + "grad_norm": 0.7749906182289124, + "kl": 0.17529296875, + "learning_rate": 1e-06, + "loss": 0.011, + "num_tokens": 288540418.0, + "reward": 1.4218751192092896, + "reward_std": 0.22093766927719116, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4308035671710968, + "rewards/curriculum_aware_reward_fn/std": 0.40354543924331665, + "step": 2454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 995.0, + "completions/max_terminated_length": 995.0, + "completions/mean_length": 439.0089416503906, + "completions/mean_terminated_length": 439.0089416503906, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 2.5334021150374, + "grad_norm": 0.6202346682548523, + "kl": 0.15771484375, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 288646217.0, + "reward": 1.4843751192092896, + "reward_std": 0.17100587487220764, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.484375, + "rewards/curriculum_aware_reward_fn/std": 0.445600688457489, + "step": 2455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1362.0, + "completions/max_terminated_length": 1362.0, + "completions/mean_length": 475.4732360839844, + "completions/mean_terminated_length": 475.4732360839844, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 2.5344338405984006, + "grad_norm": 0.7288236618041992, + "kl": 0.156982421875, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 288773341.0, + "reward": 1.487946629524231, + "reward_std": 0.21048535406589508, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48794645071029663, + "rewards/curriculum_aware_reward_fn/std": 0.4141770005226135, + "step": 2456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1653.0, + "completions/max_terminated_length": 1653.0, + "completions/mean_length": 464.51788330078125, + "completions/mean_terminated_length": 464.51788330078125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 2.5354655661594014, + "grad_norm": 0.8047718405723572, + "kl": 0.157470703125, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 288888570.0, + "reward": 1.5767858028411865, + "reward_std": 0.2637113332748413, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5857143402099609, + "rewards/curriculum_aware_reward_fn/std": 0.45798367261886597, + "step": 2457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2133.0, + "completions/max_terminated_length": 2133.0, + "completions/mean_length": 588.8125, + "completions/mean_terminated_length": 588.8125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 2.5364972917204023, + "grad_norm": 0.7194903492927551, + "kl": 0.14111328125, + "learning_rate": 1e-06, + "loss": 0.009, + "num_tokens": 289035444.0, + "reward": 1.3660714626312256, + "reward_std": 0.18908150494098663, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3660714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.34296101331710815, + "step": 2458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1347.0, + "completions/max_terminated_length": 1347.0, + "completions/mean_length": 534.1964721679688, + "completions/mean_terminated_length": 534.1964721679688, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 2.537529017281403, + "grad_norm": 0.7994199991226196, + "kl": 0.1484375, + "learning_rate": 1e-06, + "loss": 0.0247, + "num_tokens": 289162278.0, + "reward": 1.3794645071029663, + "reward_std": 0.20744940638542175, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3883928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3257688283920288, + "step": 2459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2228.0, + "completions/max_terminated_length": 2228.0, + "completions/mean_length": 562.669677734375, + "completions/mean_terminated_length": 562.669677734375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 2.538560742842404, + "grad_norm": 0.7126930356025696, + "kl": 0.1357421875, + "learning_rate": 1e-06, + "loss": 0.054, + "num_tokens": 289293904.0, + "reward": 1.5178571939468384, + "reward_std": 0.2616358697414398, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5267857313156128, + "rewards/curriculum_aware_reward_fn/std": 0.42170441150665283, + "step": 2460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2335.0, + "completions/max_terminated_length": 2335.0, + "completions/mean_length": 581.3482666015625, + "completions/mean_terminated_length": 581.3482666015625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 2.539592468403405, + "grad_norm": 0.709905207157135, + "kl": 0.136962890625, + "learning_rate": 1e-06, + "loss": -0.0371, + "num_tokens": 289430240.0, + "reward": 1.3718750476837158, + "reward_std": 0.21493136882781982, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.37187501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.4054127335548401, + "step": 2461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2754.0, + "completions/max_terminated_length": 2754.0, + "completions/mean_length": 504.64288330078125, + "completions/mean_terminated_length": 504.64288330078125, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 2.5406241939644056, + "grad_norm": 0.7318965792655945, + "kl": 0.162109375, + "learning_rate": 1e-06, + "loss": -0.0258, + "num_tokens": 289556102.0, + "reward": 1.6785714626312256, + "reward_std": 0.1515929102897644, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6785714030265808, + "rewards/curriculum_aware_reward_fn/std": 0.5129280686378479, + "step": 2462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1980.0, + "completions/max_terminated_length": 1980.0, + "completions/mean_length": 463.5535888671875, + "completions/mean_terminated_length": 463.5535888671875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 2.5416559195254065, + "grad_norm": 0.7726377248764038, + "kl": 0.161376953125, + "learning_rate": 1e-06, + "loss": 0.0123, + "num_tokens": 289671160.0, + "reward": 1.6642857789993286, + "reward_std": 0.2374168336391449, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6642857193946838, + "rewards/curriculum_aware_reward_fn/std": 0.3770695626735687, + "step": 2463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1769.0, + "completions/mean_length": 530.0803833007812, + "completions/mean_terminated_length": 497.9549560546875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 2.542687645086407, + "grad_norm": 0.7113441228866577, + "kl": 0.16357421875, + "learning_rate": 1e-06, + "loss": 0.0104, + "num_tokens": 289794257.0, + "reward": 1.593750238418579, + "reward_std": 0.1650165319442749, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.59375, + "rewards/curriculum_aware_reward_fn/std": 0.430698424577713, + "step": 2464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 984.0, + "completions/max_terminated_length": 984.0, + "completions/mean_length": 452.4732360839844, + "completions/mean_terminated_length": 452.4732360839844, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 2.5437193706474077, + "grad_norm": 0.6848057508468628, + "kl": 0.156494140625, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 289916425.0, + "reward": 1.5254465341567993, + "reward_std": 0.1454419493675232, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.534375011920929, + "rewards/curriculum_aware_reward_fn/std": 0.43521925806999207, + "step": 2465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1088.0, + "completions/max_terminated_length": 1088.0, + "completions/mean_length": 517.5267944335938, + "completions/mean_terminated_length": 517.5267944335938, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 2.5447510962084086, + "grad_norm": 0.7695538401603699, + "kl": 0.15869140625, + "learning_rate": 1e-06, + "loss": -0.0267, + "num_tokens": 290044710.0, + "reward": 1.4553571939468384, + "reward_std": 0.1838531196117401, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4642857015132904, + "rewards/curriculum_aware_reward_fn/std": 0.40981364250183105, + "step": 2466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2111.0, + "completions/max_terminated_length": 2111.0, + "completions/mean_length": 508.46429443359375, + "completions/mean_terminated_length": 508.46429443359375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 2.5457828217694094, + "grad_norm": 0.7199175357818604, + "kl": 0.166015625, + "learning_rate": 1e-06, + "loss": 0.0456, + "num_tokens": 290165421.0, + "reward": 1.6058037281036377, + "reward_std": 0.17861010134220123, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6147321462631226, + "rewards/curriculum_aware_reward_fn/std": 0.42673271894454956, + "step": 2467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1183.0, + "completions/max_terminated_length": 1183.0, + "completions/mean_length": 485.08929443359375, + "completions/mean_terminated_length": 485.08929443359375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 2.5468145473304102, + "grad_norm": 0.7515586614608765, + "kl": 0.159423828125, + "learning_rate": 1e-06, + "loss": -0.0235, + "num_tokens": 290293594.0, + "reward": 1.5303571224212646, + "reward_std": 0.21597933769226074, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5303571820259094, + "rewards/curriculum_aware_reward_fn/std": 0.41440051794052124, + "step": 2468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2193.0, + "completions/max_terminated_length": 2193.0, + "completions/mean_length": 581.732177734375, + "completions/mean_terminated_length": 581.732177734375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 2.5478462728914106, + "grad_norm": 0.7689985632896423, + "kl": 0.166015625, + "learning_rate": 1e-06, + "loss": 0.011, + "num_tokens": 290426466.0, + "reward": 1.4830358028411865, + "reward_std": 0.2173272669315338, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48303574323654175, + "rewards/curriculum_aware_reward_fn/std": 0.3862241208553314, + "step": 2469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2914.0, + "completions/max_terminated_length": 2914.0, + "completions/mean_length": 592.9107666015625, + "completions/mean_terminated_length": 592.9107666015625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 2.5488779984524115, + "grad_norm": 0.5767408013343811, + "kl": 0.160888671875, + "learning_rate": 1e-06, + "loss": 0.0269, + "num_tokens": 290551273.0, + "reward": 1.3696428537368774, + "reward_std": 0.172710120677948, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36964288353919983, + "rewards/curriculum_aware_reward_fn/std": 0.4238581657409668, + "step": 2470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1677.0, + "completions/mean_length": 651.8482666015625, + "completions/mean_terminated_length": 589.2272338867188, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 2.5499097240134123, + "grad_norm": 0.6692609190940857, + "kl": 0.16162109375, + "learning_rate": 1e-06, + "loss": 0.0472, + "num_tokens": 290694525.0, + "reward": 1.5656250715255737, + "reward_std": 0.19344080984592438, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5745536088943481, + "rewards/curriculum_aware_reward_fn/std": 0.41611650586128235, + "step": 2471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3462.0, + "completions/mean_length": 654.8392944335938, + "completions/mean_terminated_length": 623.8378295898438, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 2.550941449574413, + "grad_norm": 0.6685264706611633, + "kl": 0.1474609375, + "learning_rate": 1e-06, + "loss": 0.0319, + "num_tokens": 290833800.0, + "reward": 1.5674108266830444, + "reward_std": 0.24880172312259674, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5763393044471741, + "rewards/curriculum_aware_reward_fn/std": 0.3816724717617035, + "step": 2472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0357142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3203.0, + "completions/mean_length": 808.3660888671875, + "completions/mean_terminated_length": 686.6018676757812, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 2.551973175135414, + "grad_norm": 0.5768068432807922, + "kl": 0.14697265625, + "learning_rate": 1e-06, + "loss": 0.0611, + "num_tokens": 290988506.0, + "reward": 1.541517972946167, + "reward_std": 0.15940067172050476, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5415178537368774, + "rewards/curriculum_aware_reward_fn/std": 0.40374869108200073, + "step": 2473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3218.0, + "completions/mean_length": 705.7767944335938, + "completions/mean_terminated_length": 675.2342529296875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 2.553004900696415, + "grad_norm": 0.6619222164154053, + "kl": 0.154052734375, + "learning_rate": 1e-06, + "loss": 0.0367, + "num_tokens": 291135173.0, + "reward": 1.5513393878936768, + "reward_std": 0.21687711775302887, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5513392686843872, + "rewards/curriculum_aware_reward_fn/std": 0.42178618907928467, + "step": 2474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3998.0, + "completions/mean_length": 719.6875610351562, + "completions/mean_terminated_length": 689.270263671875, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 2.5540366262574157, + "grad_norm": 0.6049193739891052, + "kl": 0.142578125, + "learning_rate": 1e-06, + "loss": 0.0594, + "num_tokens": 291278225.0, + "reward": 1.6330357789993286, + "reward_std": 0.23981758952140808, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6330356597900391, + "rewards/curriculum_aware_reward_fn/std": 0.4030599892139435, + "step": 2475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3030.0, + "completions/mean_length": 745.1160888671875, + "completions/mean_terminated_length": 714.9279174804688, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 2.5550683518184165, + "grad_norm": 0.5797421336174011, + "kl": 0.145263671875, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 291426267.0, + "reward": 1.4638394117355347, + "reward_std": 0.2388923615217209, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4727678894996643, + "rewards/curriculum_aware_reward_fn/std": 0.4605562686920166, + "step": 2476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3957.0, + "completions/max_terminated_length": 3957.0, + "completions/mean_length": 685.1517944335938, + "completions/mean_terminated_length": 685.1517944335938, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 2.556100077379417, + "grad_norm": 0.664794921875, + "kl": 0.151611328125, + "learning_rate": 1e-06, + "loss": -0.0832, + "num_tokens": 291573347.0, + "reward": 1.4888393878936768, + "reward_std": 0.2121625393629074, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4888392984867096, + "rewards/curriculum_aware_reward_fn/std": 0.34755972027778625, + "step": 2477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3135.0, + "completions/mean_length": 831.982177734375, + "completions/mean_terminated_length": 802.5765991210938, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 2.5571318029404178, + "grad_norm": 0.530683696269989, + "kl": 0.136962890625, + "learning_rate": 1e-06, + "loss": 0.0293, + "num_tokens": 291733806.0, + "reward": 1.433035969734192, + "reward_std": 0.2383945882320404, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4419642984867096, + "rewards/curriculum_aware_reward_fn/std": 0.43233877420425415, + "step": 2478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3562.0, + "completions/max_terminated_length": 3562.0, + "completions/mean_length": 703.5267944335938, + "completions/mean_terminated_length": 703.5267944335938, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 2.5581635285014186, + "grad_norm": 0.5770664215087891, + "kl": 0.153564453125, + "learning_rate": 1e-06, + "loss": -0.0261, + "num_tokens": 291882573.0, + "reward": 1.5433037281036377, + "reward_std": 0.21655043959617615, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5433036088943481, + "rewards/curriculum_aware_reward_fn/std": 0.4411030113697052, + "step": 2479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3648.0, + "completions/mean_length": 762.1607666015625, + "completions/mean_terminated_length": 732.1261596679688, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 2.5591952540624194, + "grad_norm": 0.6695989370346069, + "kl": 0.157470703125, + "learning_rate": 1e-06, + "loss": -0.073, + "num_tokens": 292040721.0, + "reward": 1.4433035850524902, + "reward_std": 0.288400262594223, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.47008928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.39154690504074097, + "step": 2480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3680.0, + "completions/max_terminated_length": 3680.0, + "completions/mean_length": 700.5089721679688, + "completions/mean_terminated_length": 700.5089721679688, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 2.5602269796234203, + "grad_norm": 0.5907813906669617, + "kl": 0.138671875, + "learning_rate": 1e-06, + "loss": 0.0907, + "num_tokens": 292186910.0, + "reward": 1.35535728931427, + "reward_std": 0.2883957028388977, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.3732143044471741, + "rewards/curriculum_aware_reward_fn/std": 0.40201789140701294, + "step": 2481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3300.0, + "completions/max_terminated_length": 3300.0, + "completions/mean_length": 612.6875, + "completions/mean_terminated_length": 612.6875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 2.5612587051844207, + "grad_norm": 0.6172129511833191, + "kl": 0.143310546875, + "learning_rate": 1e-06, + "loss": 0.0363, + "num_tokens": 292317453.0, + "reward": 1.38660728931427, + "reward_std": 0.11005396395921707, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3866071403026581, + "rewards/curriculum_aware_reward_fn/std": 0.3735019564628601, + "step": 2482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3715.0, + "completions/mean_length": 775.5982666015625, + "completions/mean_terminated_length": 745.6846923828125, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 2.5622904307454215, + "grad_norm": 0.5745029449462891, + "kl": 0.141357421875, + "learning_rate": 1e-06, + "loss": 0.0298, + "num_tokens": 292478247.0, + "reward": 1.450446605682373, + "reward_std": 0.24547609686851501, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45044639706611633, + "rewards/curriculum_aware_reward_fn/std": 0.35320258140563965, + "step": 2483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3417.0, + "completions/max_terminated_length": 3417.0, + "completions/mean_length": 662.4642944335938, + "completions/mean_terminated_length": 662.4642944335938, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 2.5633221563064224, + "grad_norm": 0.6197950839996338, + "kl": 0.14501953125, + "learning_rate": 1e-06, + "loss": -0.0226, + "num_tokens": 292615800.0, + "reward": 1.4772323369979858, + "reward_std": 0.2510889768600464, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.4950892925262451, + "rewards/curriculum_aware_reward_fn/std": 0.4198862612247467, + "step": 2484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2696.0, + "completions/mean_length": 764.8035888671875, + "completions/mean_terminated_length": 704.236328125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 2.564353881867423, + "grad_norm": 0.6195743083953857, + "kl": 0.138916015625, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 292764185.0, + "reward": 1.4839287996292114, + "reward_std": 0.17358790338039398, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48392853140830994, + "rewards/curriculum_aware_reward_fn/std": 0.4302845299243927, + "step": 2485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3639.0, + "completions/mean_length": 695.0892944335938, + "completions/mean_terminated_length": 664.450439453125, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 2.565385607428424, + "grad_norm": 0.6343938112258911, + "kl": 0.1455078125, + "learning_rate": 1e-06, + "loss": 0.0215, + "num_tokens": 292909837.0, + "reward": 1.516517996788025, + "reward_std": 0.2180844098329544, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5254464149475098, + "rewards/curriculum_aware_reward_fn/std": 0.38554954528808594, + "step": 2486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1761.0, + "completions/mean_length": 628.9107666015625, + "completions/mean_terminated_length": 597.6757202148438, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 2.566417332989425, + "grad_norm": 0.6394612193107605, + "kl": 0.146484375, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 293047174.0, + "reward": 1.5357145071029663, + "reward_std": 0.3192685842514038, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5357142686843872, + "rewards/curriculum_aware_reward_fn/std": 0.4290902018547058, + "step": 2487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2599.0, + "completions/mean_length": 652.607177734375, + "completions/mean_terminated_length": 621.5855712890625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 2.5674490585504257, + "grad_norm": 0.7118006348609924, + "kl": 0.155029296875, + "learning_rate": 1e-06, + "loss": 0.0636, + "num_tokens": 293186388.0, + "reward": 1.5379464626312256, + "reward_std": 0.2745712697505951, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5379464030265808, + "rewards/curriculum_aware_reward_fn/std": 0.4722118675708771, + "step": 2488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1626.0, + "completions/max_terminated_length": 1626.0, + "completions/mean_length": 582.7678833007812, + "completions/mean_terminated_length": 582.7678833007812, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 2.5684807841114266, + "grad_norm": 0.7305005192756653, + "kl": 0.150390625, + "learning_rate": 1e-06, + "loss": 0.0279, + "num_tokens": 293318242.0, + "reward": 1.3830358982086182, + "reward_std": 0.20811490714550018, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.39196428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.4081133008003235, + "step": 2489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1658.0, + "completions/max_terminated_length": 1658.0, + "completions/mean_length": 516.3928833007812, + "completions/mean_terminated_length": 516.3928833007812, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 2.5695125096724274, + "grad_norm": 0.6877014636993408, + "kl": 0.151123046875, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 293442949.0, + "reward": 1.517857313156128, + "reward_std": 0.17956021428108215, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5267857313156128, + "rewards/curriculum_aware_reward_fn/std": 0.4478139281272888, + "step": 2490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2395.0, + "completions/max_terminated_length": 2395.0, + "completions/mean_length": 593.1785888671875, + "completions/mean_terminated_length": 593.1785888671875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 2.570544235233428, + "grad_norm": 0.7326036691665649, + "kl": 0.140869140625, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 293576907.0, + "reward": 1.5151785612106323, + "reward_std": 0.20150600373744965, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5151785612106323, + "rewards/curriculum_aware_reward_fn/std": 0.44112056493759155, + "step": 2491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2668.0, + "completions/max_terminated_length": 2668.0, + "completions/mean_length": 655.4642944335938, + "completions/mean_terminated_length": 655.4642944335938, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 2.5715759607944286, + "grad_norm": 0.6929550766944885, + "kl": 0.137451171875, + "learning_rate": 1e-06, + "loss": 0.0143, + "num_tokens": 293719731.0, + "reward": 1.5000001192092896, + "reward_std": 0.22161515057086945, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5, + "rewards/curriculum_aware_reward_fn/std": 0.42511260509490967, + "step": 2492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3589.0, + "completions/mean_length": 609.0892944335938, + "completions/mean_terminated_length": 577.6756591796875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 2.5726076863554295, + "grad_norm": 0.5321632623672485, + "kl": 0.1278076171875, + "learning_rate": 1e-06, + "loss": -0.0153, + "num_tokens": 293849129.0, + "reward": 1.5357143878936768, + "reward_std": 0.14974889159202576, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5357142686843872, + "rewards/curriculum_aware_reward_fn/std": 0.428302139043808, + "step": 2493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1772.0, + "completions/max_terminated_length": 1772.0, + "completions/mean_length": 537.8482666015625, + "completions/mean_terminated_length": 537.8482666015625, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 2.5736394119164303, + "grad_norm": 0.7619335651397705, + "kl": 0.14697265625, + "learning_rate": 1e-06, + "loss": -0.0658, + "num_tokens": 293969281.0, + "reward": 1.4763394594192505, + "reward_std": 0.21331331133842468, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47633928060531616, + "rewards/curriculum_aware_reward_fn/std": 0.430794358253479, + "step": 2494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2394.0, + "completions/max_terminated_length": 2394.0, + "completions/mean_length": 540.107177734375, + "completions/mean_terminated_length": 540.107177734375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 2.574671137477431, + "grad_norm": 0.548896074295044, + "kl": 0.14501953125, + "learning_rate": 1e-06, + "loss": 0.0275, + "num_tokens": 294092948.0, + "reward": 1.4799107313156128, + "reward_std": 0.14194467663764954, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4799107015132904, + "rewards/curriculum_aware_reward_fn/std": 0.4888605773448944, + "step": 2495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1709.0, + "completions/mean_length": 569.7232666015625, + "completions/mean_terminated_length": 537.9549560546875, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 2.5757028630384315, + "grad_norm": 0.6730985641479492, + "kl": 0.1494140625, + "learning_rate": 1e-06, + "loss": -0.042, + "num_tokens": 294223376.0, + "reward": 1.5250000953674316, + "reward_std": 0.1675071120262146, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5250000357627869, + "rewards/curriculum_aware_reward_fn/std": 0.43933814764022827, + "step": 2496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2365.0, + "completions/mean_length": 538.5089721679688, + "completions/mean_terminated_length": 506.45947265625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 2.5767345885994324, + "grad_norm": 0.7262006402015686, + "kl": 0.150390625, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 294352324.0, + "reward": 1.4763394594192505, + "reward_std": 0.15129299461841583, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47633931040763855, + "rewards/curriculum_aware_reward_fn/std": 0.41589802503585815, + "step": 2497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2190.0, + "completions/max_terminated_length": 2190.0, + "completions/mean_length": 520.3660888671875, + "completions/mean_terminated_length": 520.3660888671875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 2.5777663141604332, + "grad_norm": 0.6421841382980347, + "kl": 0.1376953125, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 294478512.0, + "reward": 1.708035945892334, + "reward_std": 0.20250266790390015, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.7080357670783997, + "rewards/curriculum_aware_reward_fn/std": 0.5012845396995544, + "step": 2498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3100.0, + "completions/max_terminated_length": 3100.0, + "completions/mean_length": 635.857177734375, + "completions/mean_terminated_length": 635.857177734375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 2.578798039721434, + "grad_norm": 0.6643877625465393, + "kl": 0.142333984375, + "learning_rate": 1e-06, + "loss": -0.0229, + "num_tokens": 294619238.0, + "reward": 1.4223215579986572, + "reward_std": 0.20365062355995178, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.4401785731315613, + "rewards/curriculum_aware_reward_fn/std": 0.4161621928215027, + "step": 2499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3352.0, + "completions/max_terminated_length": 3352.0, + "completions/mean_length": 602.0178833007812, + "completions/mean_terminated_length": 602.0178833007812, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 2.579829765282435, + "grad_norm": 0.5811689496040344, + "kl": 0.143310546875, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 294755719.0, + "reward": 1.3437501192092896, + "reward_std": 0.15117493271827698, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34375, + "rewards/curriculum_aware_reward_fn/std": 0.41357186436653137, + "step": 2500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2637.0, + "completions/max_terminated_length": 2637.0, + "completions/mean_length": 622.9910888671875, + "completions/mean_terminated_length": 622.9910888671875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 2.5808614908434357, + "grad_norm": 0.7609834671020508, + "kl": 0.146484375, + "learning_rate": 1e-06, + "loss": -0.0487, + "num_tokens": 294884238.0, + "reward": 1.6107144355773926, + "reward_std": 0.2443053424358368, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.610714316368103, + "rewards/curriculum_aware_reward_fn/std": 0.41678890585899353, + "step": 2501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2255.0, + "completions/max_terminated_length": 2255.0, + "completions/mean_length": 649.2678833007812, + "completions/mean_terminated_length": 649.2678833007812, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 2.5818932164044366, + "grad_norm": 0.5464531183242798, + "kl": 0.144287109375, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 295032479.0, + "reward": 1.6593750715255737, + "reward_std": 0.20846553146839142, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.659375011920929, + "rewards/curriculum_aware_reward_fn/std": 0.48923230171203613, + "step": 2502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1999.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 578.4732666015625, + "completions/mean_terminated_length": 578.4732666015625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 2.5829249419654374, + "grad_norm": 0.7156299948692322, + "kl": 0.1494140625, + "learning_rate": 1e-06, + "loss": -0.0057, + "num_tokens": 295170887.0, + "reward": 1.5062501430511475, + "reward_std": 0.22756358981132507, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5062500238418579, + "rewards/curriculum_aware_reward_fn/std": 0.42288821935653687, + "step": 2503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3690.0, + "completions/max_terminated_length": 3690.0, + "completions/mean_length": 775.9910888671875, + "completions/mean_terminated_length": 775.9910888671875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 2.583956667526438, + "grad_norm": 0.5931575298309326, + "kl": 0.1270751953125, + "learning_rate": 1e-06, + "loss": 0.0924, + "num_tokens": 295333818.0, + "reward": 1.3321430683135986, + "reward_std": 0.22101646661758423, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3410714268684387, + "rewards/curriculum_aware_reward_fn/std": 0.35004594922065735, + "step": 2504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2471.0, + "completions/mean_length": 748.794677734375, + "completions/mean_terminated_length": 687.9363403320312, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 2.5849883930874387, + "grad_norm": 0.6465186476707458, + "kl": 0.1312255859375, + "learning_rate": 1e-06, + "loss": 0.0172, + "num_tokens": 295493220.0, + "reward": 1.3633930683135986, + "reward_std": 0.2086215764284134, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36339282989501953, + "rewards/curriculum_aware_reward_fn/std": 0.468992680311203, + "step": 2505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1713.0, + "completions/max_terminated_length": 1713.0, + "completions/mean_length": 576.919677734375, + "completions/mean_terminated_length": 576.919677734375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 2.5860201186484395, + "grad_norm": 0.6617150902748108, + "kl": 0.138916015625, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 295626477.0, + "reward": 1.6214287281036377, + "reward_std": 0.24607469141483307, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6214286088943481, + "rewards/curriculum_aware_reward_fn/std": 0.5417153835296631, + "step": 2506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2568.0, + "completions/max_terminated_length": 2568.0, + "completions/mean_length": 541.5267944335938, + "completions/mean_terminated_length": 541.5267944335938, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 2.5870518442094403, + "grad_norm": 0.6159235239028931, + "kl": 0.141357421875, + "learning_rate": 1e-06, + "loss": -0.0058, + "num_tokens": 295751087.0, + "reward": 1.4861608743667603, + "reward_std": 0.16843244433403015, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.49508926272392273, + "rewards/curriculum_aware_reward_fn/std": 0.4519207775592804, + "step": 2507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3286.0, + "completions/max_terminated_length": 3286.0, + "completions/mean_length": 628.2589721679688, + "completions/mean_terminated_length": 628.2589721679688, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 2.588083569770441, + "grad_norm": 0.6114004254341125, + "kl": 0.144775390625, + "learning_rate": 1e-06, + "loss": 0.0077, + "num_tokens": 295886202.0, + "reward": 1.3754465579986572, + "reward_std": 0.17992451786994934, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.3933035731315613, + "rewards/curriculum_aware_reward_fn/std": 0.39045456051826477, + "step": 2508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2216.0, + "completions/max_terminated_length": 2216.0, + "completions/mean_length": 498.1875305175781, + "completions/mean_terminated_length": 498.1875305175781, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 2.5891152953314416, + "grad_norm": 0.7495975494384766, + "kl": 0.16357421875, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 296007624.0, + "reward": 1.6892858743667603, + "reward_std": 0.1631748378276825, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6892856955528259, + "rewards/curriculum_aware_reward_fn/std": 0.5288279056549072, + "step": 2509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2493.0, + "completions/max_terminated_length": 2493.0, + "completions/mean_length": 536.2232666015625, + "completions/mean_terminated_length": 536.2232666015625, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 2.5901470208924424, + "grad_norm": 0.8427813649177551, + "kl": 0.13330078125, + "learning_rate": 1e-06, + "loss": 0.0517, + "num_tokens": 296133411.0, + "reward": 1.5415178537368774, + "reward_std": 0.25848016142845154, + "rewards/code_format_reward/mean": 0.9553571343421936, + "rewards/code_format_reward/std": 0.2074466347694397, + "rewards/curriculum_aware_reward_fn/mean": 0.5861607193946838, + "rewards/curriculum_aware_reward_fn/std": 0.45590272545814514, + "step": 2510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1591.0, + "completions/mean_length": 683.0803833007812, + "completions/mean_terminated_length": 621.0272827148438, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 2.5911787464534433, + "grad_norm": 0.6702219843864441, + "kl": 0.12060546875, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 296274449.0, + "reward": 1.5794644355773926, + "reward_std": 0.25675874948501587, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.579464316368103, + "rewards/curriculum_aware_reward_fn/std": 0.41868269443511963, + "step": 2511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3516.0, + "completions/max_terminated_length": 3516.0, + "completions/mean_length": 546.6160888671875, + "completions/mean_terminated_length": 546.6160888671875, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 2.592210472014444, + "grad_norm": 0.690642237663269, + "kl": 0.136474609375, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 296393994.0, + "reward": 1.532142996788025, + "reward_std": 0.1899871975183487, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.550000011920929, + "rewards/curriculum_aware_reward_fn/std": 0.4154125154018402, + "step": 2512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2391.0, + "completions/max_terminated_length": 2391.0, + "completions/mean_length": 629.4732666015625, + "completions/mean_terminated_length": 629.4732666015625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 2.593242197575445, + "grad_norm": 0.7589452266693115, + "kl": 0.14111328125, + "learning_rate": 1e-06, + "loss": -0.0108, + "num_tokens": 296531667.0, + "reward": 1.415178656578064, + "reward_std": 0.23947352170944214, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4151785671710968, + "rewards/curriculum_aware_reward_fn/std": 0.4060266613960266, + "step": 2513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1762.0, + "completions/max_terminated_length": 1762.0, + "completions/mean_length": 581.8482666015625, + "completions/mean_terminated_length": 581.8482666015625, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 2.594273923136446, + "grad_norm": 0.6457692384719849, + "kl": 0.137451171875, + "learning_rate": 1e-06, + "loss": 0.0044, + "num_tokens": 296664573.0, + "reward": 1.4245537519454956, + "reward_std": 0.15253514051437378, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42455360293388367, + "rewards/curriculum_aware_reward_fn/std": 0.4164952039718628, + "step": 2514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3557.0, + "completions/max_terminated_length": 3557.0, + "completions/mean_length": 605.0535888671875, + "completions/mean_terminated_length": 605.0535888671875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 2.5953056486974466, + "grad_norm": 0.7411686778068542, + "kl": 0.1356201171875, + "learning_rate": 1e-06, + "loss": 0.0455, + "num_tokens": 296801758.0, + "reward": 1.4861607551574707, + "reward_std": 0.18960067629814148, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4861607253551483, + "rewards/curriculum_aware_reward_fn/std": 0.35133013129234314, + "step": 2515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2271.0, + "completions/max_terminated_length": 2271.0, + "completions/mean_length": 551.9285888671875, + "completions/mean_terminated_length": 551.9285888671875, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 2.5963373742584475, + "grad_norm": 0.6617128849029541, + "kl": 0.13623046875, + "learning_rate": 1e-06, + "loss": 0.0369, + "num_tokens": 296921534.0, + "reward": 1.532142996788025, + "reward_std": 0.16609406471252441, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5321428179740906, + "rewards/curriculum_aware_reward_fn/std": 0.43974077701568604, + "step": 2516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1540.0, + "completions/max_terminated_length": 1540.0, + "completions/mean_length": 578.3660888671875, + "completions/mean_terminated_length": 578.3660888671875, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 2.5973690998194483, + "grad_norm": 0.6656718850135803, + "kl": 0.1278076171875, + "learning_rate": 1e-06, + "loss": 0.0153, + "num_tokens": 297058908.0, + "reward": 1.3535715341567993, + "reward_std": 0.27912285923957825, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35357141494750977, + "rewards/curriculum_aware_reward_fn/std": 0.4194437861442566, + "step": 2517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2031.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 581.7589721679688, + "completions/mean_terminated_length": 581.7589721679688, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 2.5984008253804487, + "grad_norm": 0.6276634931564331, + "kl": 0.12451171875, + "learning_rate": 1e-06, + "loss": -0.0349, + "num_tokens": 297194562.0, + "reward": 1.614732265472412, + "reward_std": 0.2275889366865158, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6236607432365417, + "rewards/curriculum_aware_reward_fn/std": 0.38208532333374023, + "step": 2518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1959.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 557.75, + "completions/mean_terminated_length": 557.75, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 2.5994325509414495, + "grad_norm": 0.6883637309074402, + "kl": 0.12890625, + "learning_rate": 1e-06, + "loss": 0.0231, + "num_tokens": 297318364.0, + "reward": 1.4714287519454956, + "reward_std": 0.18093356490135193, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4714285731315613, + "rewards/curriculum_aware_reward_fn/std": 0.43937474489212036, + "step": 2519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1787.0, + "completions/mean_length": 554.3482666015625, + "completions/mean_terminated_length": 522.4414672851562, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 2.6004642765024504, + "grad_norm": 0.6628627777099609, + "kl": 0.1385498046875, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 297437158.0, + "reward": 1.5455358028411865, + "reward_std": 0.21819821000099182, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5544642806053162, + "rewards/curriculum_aware_reward_fn/std": 0.43119117617607117, + "step": 2520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1789.0, + "completions/mean_length": 547.25, + "completions/mean_terminated_length": 515.279296875, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 2.601496002063451, + "grad_norm": 0.673978865146637, + "kl": 0.14794921875, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 297566210.0, + "reward": 1.531250238418579, + "reward_std": 0.22155684232711792, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.53125, + "rewards/curriculum_aware_reward_fn/std": 0.39066386222839355, + "step": 2521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3205.0, + "completions/max_terminated_length": 3205.0, + "completions/mean_length": 599.8303833007812, + "completions/mean_terminated_length": 599.8303833007812, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 2.602527727624452, + "grad_norm": 0.7220893502235413, + "kl": 0.127685546875, + "learning_rate": 1e-06, + "loss": 0.0695, + "num_tokens": 297710517.0, + "reward": 1.4861608743667603, + "reward_std": 0.21848052740097046, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4861607253551483, + "rewards/curriculum_aware_reward_fn/std": 0.41774865984916687, + "step": 2522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2624.0, + "completions/max_terminated_length": 2624.0, + "completions/mean_length": 521.732177734375, + "completions/mean_terminated_length": 521.732177734375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 2.6035594531854525, + "grad_norm": 0.6924219131469727, + "kl": 0.1416015625, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 297828591.0, + "reward": 1.5562502145767212, + "reward_std": 0.19886499643325806, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5562499761581421, + "rewards/curriculum_aware_reward_fn/std": 0.5339013338088989, + "step": 2523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2363.0, + "completions/max_terminated_length": 2363.0, + "completions/mean_length": 526.1875, + "completions/mean_terminated_length": 526.1875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 2.6045911787464533, + "grad_norm": 0.6960251927375793, + "kl": 0.128662109375, + "learning_rate": 1e-06, + "loss": 0.0337, + "num_tokens": 297961640.0, + "reward": 1.5500000715255737, + "reward_std": 0.24628612399101257, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5589285492897034, + "rewards/curriculum_aware_reward_fn/std": 0.4280804693698883, + "step": 2524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2201.0, + "completions/max_terminated_length": 2201.0, + "completions/mean_length": 553.5535888671875, + "completions/mean_terminated_length": 553.5535888671875, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 2.605622904307454, + "grad_norm": 0.6908318400382996, + "kl": 0.1285400390625, + "learning_rate": 1e-06, + "loss": -0.0614, + "num_tokens": 298090236.0, + "reward": 1.440178632736206, + "reward_std": 0.20642748475074768, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4401785731315613, + "rewards/curriculum_aware_reward_fn/std": 0.3992582857608795, + "step": 2525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1919.0, + "completions/mean_length": 617.3482666015625, + "completions/mean_terminated_length": 586.009033203125, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 2.606654629868455, + "grad_norm": 0.5960627794265747, + "kl": 0.13525390625, + "learning_rate": 1e-06, + "loss": 0.024, + "num_tokens": 298226439.0, + "reward": 1.3562501668930054, + "reward_std": 0.17817705869674683, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35625001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.40637513995170593, + "step": 2526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1974.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 567.3214721679688, + "completions/mean_terminated_length": 567.3214721679688, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 2.607686355429456, + "grad_norm": 0.6354534029960632, + "kl": 0.1298828125, + "learning_rate": 1e-06, + "loss": 0.0303, + "num_tokens": 298359160.0, + "reward": 1.4379465579986572, + "reward_std": 0.1600869745016098, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43794646859169006, + "rewards/curriculum_aware_reward_fn/std": 0.4330514669418335, + "step": 2527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2765.0, + "completions/max_terminated_length": 2765.0, + "completions/mean_length": 476.1607360839844, + "completions/mean_terminated_length": 476.1607360839844, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 2.6087180809904567, + "grad_norm": 0.6645422577857971, + "kl": 0.1376953125, + "learning_rate": 1e-06, + "loss": -0.0219, + "num_tokens": 298470590.0, + "reward": 1.6022323369979858, + "reward_std": 0.24282428622245789, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6111606955528259, + "rewards/curriculum_aware_reward_fn/std": 0.43109485507011414, + "step": 2528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3282.0, + "completions/max_terminated_length": 3282.0, + "completions/mean_length": 510.7410888671875, + "completions/mean_terminated_length": 510.7410888671875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 2.6097498065514575, + "grad_norm": 0.845571756362915, + "kl": 0.14794921875, + "learning_rate": 1e-06, + "loss": 0.0062, + "num_tokens": 298594391.0, + "reward": 1.505357265472412, + "reward_std": 0.19354453682899475, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5053571462631226, + "rewards/curriculum_aware_reward_fn/std": 0.4207113981246948, + "step": 2529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1820.0, + "completions/max_terminated_length": 1820.0, + "completions/mean_length": 572.3214721679688, + "completions/mean_terminated_length": 572.3214721679688, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 2.6107815321124583, + "grad_norm": 0.6434414982795715, + "kl": 0.142333984375, + "learning_rate": 1e-06, + "loss": 0.0562, + "num_tokens": 298721022.0, + "reward": 1.5629465579986572, + "reward_std": 0.21066059172153473, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5629464387893677, + "rewards/curriculum_aware_reward_fn/std": 0.41366007924079895, + "step": 2530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1742.0, + "completions/max_terminated_length": 1742.0, + "completions/mean_length": 463.01788330078125, + "completions/mean_terminated_length": 463.01788330078125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 2.6118132576734587, + "grad_norm": 0.593588650226593, + "kl": 0.1396484375, + "learning_rate": 1e-06, + "loss": 0.0175, + "num_tokens": 298840143.0, + "reward": 1.5968750715255737, + "reward_std": 0.1363830864429474, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.596875011920929, + "rewards/curriculum_aware_reward_fn/std": 0.4179142415523529, + "step": 2531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1429.0, + "completions/max_terminated_length": 1429.0, + "completions/mean_length": 498.76788330078125, + "completions/mean_terminated_length": 498.76788330078125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 2.6128449832344596, + "grad_norm": 0.7416149377822876, + "kl": 0.1319580078125, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 298963800.0, + "reward": 1.48035728931427, + "reward_std": 0.1891711801290512, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4803571403026581, + "rewards/curriculum_aware_reward_fn/std": 0.4251844882965088, + "step": 2532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2005.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 556.6785888671875, + "completions/mean_terminated_length": 556.6785888671875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 2.6138767087954604, + "grad_norm": 0.7315823435783386, + "kl": 0.1357421875, + "learning_rate": 1e-06, + "loss": 0.03, + "num_tokens": 299091003.0, + "reward": 1.3558037281036377, + "reward_std": 0.25385361909866333, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.36473211646080017, + "rewards/curriculum_aware_reward_fn/std": 0.3844756782054901, + "step": 2533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1647.0, + "completions/max_terminated_length": 1647.0, + "completions/mean_length": 469.3660888671875, + "completions/mean_terminated_length": 469.3660888671875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 2.6149084343564613, + "grad_norm": 0.6682936549186707, + "kl": 0.135986328125, + "learning_rate": 1e-06, + "loss": -0.0119, + "num_tokens": 299204115.0, + "reward": 1.5580357313156128, + "reward_std": 0.1919260323047638, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5580357313156128, + "rewards/curriculum_aware_reward_fn/std": 0.422644704580307, + "step": 2534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1236.0, + "completions/max_terminated_length": 1236.0, + "completions/mean_length": 458.0535888671875, + "completions/mean_terminated_length": 458.0535888671875, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 2.615940159917462, + "grad_norm": 0.6064454317092896, + "kl": 0.13720703125, + "learning_rate": 1e-06, + "loss": -0.0192, + "num_tokens": 299328525.0, + "reward": 1.5906251668930054, + "reward_std": 0.18175432085990906, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5995535850524902, + "rewards/curriculum_aware_reward_fn/std": 0.4349845051765442, + "step": 2535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1414.0, + "completions/max_terminated_length": 1414.0, + "completions/mean_length": 455.51788330078125, + "completions/mean_terminated_length": 455.51788330078125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 2.6169718854784625, + "grad_norm": 0.7744899988174438, + "kl": 0.130126953125, + "learning_rate": 1e-06, + "loss": 0.0569, + "num_tokens": 299441549.0, + "reward": 1.552232265472412, + "reward_std": 0.24021115899085999, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.561160683631897, + "rewards/curriculum_aware_reward_fn/std": 0.38687413930892944, + "step": 2536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1580.0, + "completions/max_terminated_length": 1580.0, + "completions/mean_length": 448.5714416503906, + "completions/mean_terminated_length": 448.5714416503906, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 2.6180036110394633, + "grad_norm": 0.5266478061676025, + "kl": 0.1337890625, + "learning_rate": 1e-06, + "loss": 0.0408, + "num_tokens": 299564078.0, + "reward": 1.6906249523162842, + "reward_std": 0.0914338007569313, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6906250715255737, + "rewards/curriculum_aware_reward_fn/std": 0.4176042079925537, + "step": 2537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1972.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 521.294677734375, + "completions/mean_terminated_length": 521.294677734375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 2.619035336600464, + "grad_norm": 0.6374189853668213, + "kl": 0.1285400390625, + "learning_rate": 1e-06, + "loss": -0.0075, + "num_tokens": 299684952.0, + "reward": 1.4250000715255737, + "reward_std": 0.14668266475200653, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42499998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.43959441781044006, + "step": 2538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3921.0, + "completions/max_terminated_length": 3921.0, + "completions/mean_length": 571.7589721679688, + "completions/mean_terminated_length": 571.7589721679688, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 2.620067062161465, + "grad_norm": 0.6008320450782776, + "kl": 0.11767578125, + "learning_rate": 1e-06, + "loss": 0.0187, + "num_tokens": 299816427.0, + "reward": 1.3428572416305542, + "reward_std": 0.22932888567447662, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.35178571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.39329564571380615, + "step": 2539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2890.0, + "completions/max_terminated_length": 2890.0, + "completions/mean_length": 609.6785888671875, + "completions/mean_terminated_length": 609.6785888671875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 2.621098787722466, + "grad_norm": 0.5431801676750183, + "kl": 0.1221923828125, + "learning_rate": 1e-06, + "loss": -0.0254, + "num_tokens": 299961053.0, + "reward": 1.5415178537368774, + "reward_std": 0.17546537518501282, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5415178537368774, + "rewards/curriculum_aware_reward_fn/std": 0.45161813497543335, + "step": 2540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1227.0, + "completions/max_terminated_length": 1227.0, + "completions/mean_length": 449.982177734375, + "completions/mean_terminated_length": 449.982177734375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 2.6221305132834667, + "grad_norm": 0.7313409447669983, + "kl": 0.13720703125, + "learning_rate": 1e-06, + "loss": -0.0164, + "num_tokens": 300079855.0, + "reward": 1.6004464626312256, + "reward_std": 0.1854362189769745, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.609375, + "rewards/curriculum_aware_reward_fn/std": 0.42162978649139404, + "step": 2541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3242.0, + "completions/max_terminated_length": 3242.0, + "completions/mean_length": 606.7857666015625, + "completions/mean_terminated_length": 606.7857666015625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 2.6231622388444675, + "grad_norm": 0.6683008074760437, + "kl": 0.106201171875, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 300214511.0, + "reward": 1.411160945892334, + "reward_std": 0.22779394686222076, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41116073727607727, + "rewards/curriculum_aware_reward_fn/std": 0.40944427251815796, + "step": 2542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1575.0, + "completions/mean_length": 567.5089721679688, + "completions/mean_terminated_length": 535.7207641601562, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 2.6241939644054684, + "grad_norm": 0.5849696397781372, + "kl": 0.1298828125, + "learning_rate": 1e-06, + "loss": 0.0309, + "num_tokens": 300350543.0, + "reward": 1.4758929014205933, + "reward_std": 0.21480746567249298, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4758928120136261, + "rewards/curriculum_aware_reward_fn/std": 0.4028604030609131, + "step": 2543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1424.0, + "completions/max_terminated_length": 1424.0, + "completions/mean_length": 483.95538330078125, + "completions/mean_terminated_length": 483.95538330078125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 2.6252256899664688, + "grad_norm": 0.6758949756622314, + "kl": 0.1143798828125, + "learning_rate": 1e-06, + "loss": 0.0239, + "num_tokens": 300473262.0, + "reward": 1.5455358028411865, + "reward_std": 0.27307796478271484, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5633928775787354, + "rewards/curriculum_aware_reward_fn/std": 0.41405391693115234, + "step": 2544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3222.0, + "completions/max_terminated_length": 3222.0, + "completions/mean_length": 604.9553833007812, + "completions/mean_terminated_length": 604.9553833007812, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 2.6262574155274696, + "grad_norm": 0.6471558213233948, + "kl": 0.122314453125, + "learning_rate": 1e-06, + "loss": 0.027, + "num_tokens": 300611724.0, + "reward": 1.3008928298950195, + "reward_std": 0.23042990267276764, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.30089282989501953, + "rewards/curriculum_aware_reward_fn/std": 0.396492600440979, + "step": 2545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3957.0, + "completions/max_terminated_length": 3957.0, + "completions/mean_length": 596.3303833007812, + "completions/mean_terminated_length": 596.3303833007812, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 2.6272891410884704, + "grad_norm": 0.6921994090080261, + "kl": 0.11474609375, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 300742122.0, + "reward": 1.4174107313156128, + "reward_std": 0.22880640625953674, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4174107015132904, + "rewards/curriculum_aware_reward_fn/std": 0.41745203733444214, + "step": 2546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1735.0, + "completions/max_terminated_length": 1735.0, + "completions/mean_length": 408.3571472167969, + "completions/mean_terminated_length": 408.3571472167969, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 2.6283208666494713, + "grad_norm": 0.7727930545806885, + "kl": 0.1363525390625, + "learning_rate": 1e-06, + "loss": 0.0327, + "num_tokens": 300849696.0, + "reward": 1.610267996788025, + "reward_std": 0.22193004190921783, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6102678179740906, + "rewards/curriculum_aware_reward_fn/std": 0.49160236120224, + "step": 2547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1853.0, + "completions/max_terminated_length": 1853.0, + "completions/mean_length": 487.9285888671875, + "completions/mean_terminated_length": 487.9285888671875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 2.629352592210472, + "grad_norm": 0.603735089302063, + "kl": 0.12255859375, + "learning_rate": 1e-06, + "loss": -0.0085, + "num_tokens": 300972384.0, + "reward": 1.5267857313156128, + "reward_std": 0.12053783982992172, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5267857313156128, + "rewards/curriculum_aware_reward_fn/std": 0.4188104569911957, + "step": 2548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2411.0, + "completions/max_terminated_length": 2411.0, + "completions/mean_length": 566.6607666015625, + "completions/mean_terminated_length": 566.6607666015625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 2.6303843177714725, + "grad_norm": 0.5660789608955383, + "kl": 0.1123046875, + "learning_rate": 1e-06, + "loss": 0.0688, + "num_tokens": 301105752.0, + "reward": 1.4008928537368774, + "reward_std": 0.2004423588514328, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40089282393455505, + "rewards/curriculum_aware_reward_fn/std": 0.4820649325847626, + "step": 2549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3659.0, + "completions/max_terminated_length": 3659.0, + "completions/mean_length": 515.2857666015625, + "completions/mean_terminated_length": 515.2857666015625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 2.6314160433324734, + "grad_norm": 0.5825809240341187, + "kl": 0.1177978515625, + "learning_rate": 1e-06, + "loss": 0.048, + "num_tokens": 301230902.0, + "reward": 1.4799107313156128, + "reward_std": 0.16736774146556854, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4888392984867096, + "rewards/curriculum_aware_reward_fn/std": 0.43093812465667725, + "step": 2550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2199.0, + "completions/max_terminated_length": 2199.0, + "completions/mean_length": 524.2678833007812, + "completions/mean_terminated_length": 524.2678833007812, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 2.632447768893474, + "grad_norm": 0.6821950078010559, + "kl": 0.1357421875, + "learning_rate": 1e-06, + "loss": 0.0136, + "num_tokens": 301361853.0, + "reward": 1.5075894594192505, + "reward_std": 0.24523700773715973, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5254464149475098, + "rewards/curriculum_aware_reward_fn/std": 0.4395429491996765, + "step": 2551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1749.0, + "completions/max_terminated_length": 1749.0, + "completions/mean_length": 484.2857360839844, + "completions/mean_terminated_length": 484.2857360839844, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 2.633479494454475, + "grad_norm": 0.7281054854393005, + "kl": 0.130126953125, + "learning_rate": 1e-06, + "loss": -0.0172, + "num_tokens": 301483888.0, + "reward": 1.4991072416305542, + "reward_std": 0.23039059340953827, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5080357193946838, + "rewards/curriculum_aware_reward_fn/std": 0.42471790313720703, + "step": 2552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2207.0, + "completions/max_terminated_length": 2207.0, + "completions/mean_length": 474.27679443359375, + "completions/mean_terminated_length": 474.27679443359375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 2.634511220015476, + "grad_norm": 0.7366987466812134, + "kl": 0.1273193359375, + "learning_rate": 1e-06, + "loss": -0.0212, + "num_tokens": 301610850.0, + "reward": 1.4883930683135986, + "reward_std": 0.16512981057167053, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48839282989501953, + "rewards/curriculum_aware_reward_fn/std": 0.4101187586784363, + "step": 2553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1498.0, + "completions/max_terminated_length": 1498.0, + "completions/mean_length": 478.6607360839844, + "completions/mean_terminated_length": 478.6607360839844, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 2.6355429455764767, + "grad_norm": 0.7663374543190002, + "kl": 0.1181640625, + "learning_rate": 1e-06, + "loss": -0.0367, + "num_tokens": 301738373.0, + "reward": 1.5843751430511475, + "reward_std": 0.13005556166172028, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5843749642372131, + "rewards/curriculum_aware_reward_fn/std": 0.3850255310535431, + "step": 2554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1332.0, + "completions/max_terminated_length": 1332.0, + "completions/mean_length": 518.6964721679688, + "completions/mean_terminated_length": 518.6964721679688, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 2.6365746711374776, + "grad_norm": 0.656261682510376, + "kl": 0.1309814453125, + "learning_rate": 1e-06, + "loss": 0.0356, + "num_tokens": 301862337.0, + "reward": 1.3651787042617798, + "reward_std": 0.18952900171279907, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36517858505249023, + "rewards/curriculum_aware_reward_fn/std": 0.38723084330558777, + "step": 2555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2099.0, + "completions/max_terminated_length": 2099.0, + "completions/mean_length": 520.4732666015625, + "completions/mean_terminated_length": 520.4732666015625, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 2.6376063966984784, + "grad_norm": 0.6241275668144226, + "kl": 0.1243896484375, + "learning_rate": 1e-06, + "loss": 0.014, + "num_tokens": 301995817.0, + "reward": 1.55848228931427, + "reward_std": 0.18228021264076233, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5584821701049805, + "rewards/curriculum_aware_reward_fn/std": 0.4161648452281952, + "step": 2556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1396.0, + "completions/max_terminated_length": 1396.0, + "completions/mean_length": 475.0982360839844, + "completions/mean_terminated_length": 475.0982360839844, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 2.6386381222594792, + "grad_norm": 0.6838352680206299, + "kl": 0.1373291015625, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 302114982.0, + "reward": 1.4785715341567993, + "reward_std": 0.18479207158088684, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47857144474983215, + "rewards/curriculum_aware_reward_fn/std": 0.4036831855773926, + "step": 2557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2031.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 562.3392944335938, + "completions/mean_terminated_length": 562.3392944335938, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 2.6396698478204796, + "grad_norm": 0.7007812261581421, + "kl": 0.1204833984375, + "learning_rate": 1e-06, + "loss": -0.0091, + "num_tokens": 302251226.0, + "reward": 1.380357265472412, + "reward_std": 0.22040167450904846, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.38035711646080017, + "rewards/curriculum_aware_reward_fn/std": 0.38399022817611694, + "step": 2558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1885.0, + "completions/max_terminated_length": 1885.0, + "completions/mean_length": 521.9107666015625, + "completions/mean_terminated_length": 521.9107666015625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 2.6407015733814805, + "grad_norm": 0.7604739665985107, + "kl": 0.124267578125, + "learning_rate": 1e-06, + "loss": 0.0204, + "num_tokens": 302379274.0, + "reward": 1.4879463911056519, + "reward_std": 0.24727526307106018, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48794645071029663, + "rewards/curriculum_aware_reward_fn/std": 0.3997885584831238, + "step": 2559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2391.0, + "completions/max_terminated_length": 2391.0, + "completions/mean_length": 510.294677734375, + "completions/mean_terminated_length": 510.294677734375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 2.6417332989424813, + "grad_norm": 0.7016016244888306, + "kl": 0.1258544921875, + "learning_rate": 1e-06, + "loss": -0.0323, + "num_tokens": 302505951.0, + "reward": 1.4575893878936768, + "reward_std": 0.16625343263149261, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4665178656578064, + "rewards/curriculum_aware_reward_fn/std": 0.4224226474761963, + "step": 2560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1456.0, + "completions/max_terminated_length": 1456.0, + "completions/mean_length": 479.9464416503906, + "completions/mean_terminated_length": 479.9464416503906, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 2.642765024503482, + "grad_norm": 0.6770477890968323, + "kl": 0.1195068359375, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 302624770.0, + "reward": 1.5647321939468384, + "reward_std": 0.15951691567897797, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5647321343421936, + "rewards/curriculum_aware_reward_fn/std": 0.4194396734237671, + "step": 2561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2726.0, + "completions/max_terminated_length": 2726.0, + "completions/mean_length": 537.6785888671875, + "completions/mean_terminated_length": 537.6785888671875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 2.643796750064483, + "grad_norm": 0.7363307476043701, + "kl": 0.116455078125, + "learning_rate": 1e-06, + "loss": -0.0104, + "num_tokens": 302751000.0, + "reward": 1.5415178537368774, + "reward_std": 0.16541747748851776, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5415178537368774, + "rewards/curriculum_aware_reward_fn/std": 0.39824438095092773, + "step": 2562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1431.0, + "completions/max_terminated_length": 1431.0, + "completions/mean_length": 508.4375305175781, + "completions/mean_terminated_length": 508.4375305175781, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 2.6448284756254834, + "grad_norm": 0.7421542406082153, + "kl": 0.1240234375, + "learning_rate": 1e-06, + "loss": 0.0049, + "num_tokens": 302873878.0, + "reward": 1.5535715818405151, + "reward_std": 0.2116728276014328, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5535714030265808, + "rewards/curriculum_aware_reward_fn/std": 0.39322611689567566, + "step": 2563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1970.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 564.5892944335938, + "completions/mean_terminated_length": 564.5892944335938, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 2.6458602011864842, + "grad_norm": 0.7115163207054138, + "kl": 0.11474609375, + "learning_rate": 1e-06, + "loss": -0.0162, + "num_tokens": 303002751.0, + "reward": 1.4352679252624512, + "reward_std": 0.16417628526687622, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4352678656578064, + "rewards/curriculum_aware_reward_fn/std": 0.38541179895401, + "step": 2564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2077.0, + "completions/max_terminated_length": 2077.0, + "completions/mean_length": 513.625, + "completions/mean_terminated_length": 513.625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 2.646891926747485, + "grad_norm": 0.6098670363426208, + "kl": 0.1158447265625, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 303134155.0, + "reward": 1.5218751430511475, + "reward_std": 0.16423504054546356, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5218750238418579, + "rewards/curriculum_aware_reward_fn/std": 0.44178083539009094, + "step": 2565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2776.0, + "completions/max_terminated_length": 2776.0, + "completions/mean_length": 496.76788330078125, + "completions/mean_terminated_length": 496.76788330078125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 2.647923652308486, + "grad_norm": 0.7642014026641846, + "kl": 0.1285400390625, + "learning_rate": 1e-06, + "loss": 0.0455, + "num_tokens": 303257196.0, + "reward": 1.4812500476837158, + "reward_std": 0.20399358868598938, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.49910715222358704, + "rewards/curriculum_aware_reward_fn/std": 0.4325694441795349, + "step": 2566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1803.0, + "completions/max_terminated_length": 1803.0, + "completions/mean_length": 557.1160888671875, + "completions/mean_terminated_length": 557.1160888671875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 2.6489553778694868, + "grad_norm": 0.6560217142105103, + "kl": 0.1214599609375, + "learning_rate": 1e-06, + "loss": 0.0226, + "num_tokens": 303389964.0, + "reward": 1.4401787519454956, + "reward_std": 0.24112722277641296, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4401785731315613, + "rewards/curriculum_aware_reward_fn/std": 0.3977888822555542, + "step": 2567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2036.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 537.2142944335938, + "completions/mean_terminated_length": 537.2142944335938, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 2.6499871034304876, + "grad_norm": 0.6953210234642029, + "kl": 0.1326904296875, + "learning_rate": 1e-06, + "loss": -0.0027, + "num_tokens": 303512992.0, + "reward": 1.541517972946167, + "reward_std": 0.17343279719352722, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5415178537368774, + "rewards/curriculum_aware_reward_fn/std": 0.4311564266681671, + "step": 2568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2856.0, + "completions/max_terminated_length": 2856.0, + "completions/mean_length": 517.669677734375, + "completions/mean_terminated_length": 517.669677734375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 2.6510188289914884, + "grad_norm": 0.6415396928787231, + "kl": 0.114013671875, + "learning_rate": 1e-06, + "loss": -0.0173, + "num_tokens": 303638171.0, + "reward": 1.446428656578064, + "reward_std": 0.16090556979179382, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4464285671710968, + "rewards/curriculum_aware_reward_fn/std": 0.43973347544670105, + "step": 2569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1144.0, + "completions/max_terminated_length": 1144.0, + "completions/mean_length": 457.3660888671875, + "completions/mean_terminated_length": 457.3660888671875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 2.6520505545524893, + "grad_norm": 1.054216980934143, + "kl": 0.136962890625, + "learning_rate": 1e-06, + "loss": -0.0309, + "num_tokens": 303756085.0, + "reward": 1.4754465818405151, + "reward_std": 0.2000415027141571, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4754464328289032, + "rewards/curriculum_aware_reward_fn/std": 0.43074390292167664, + "step": 2570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1692.0, + "completions/max_terminated_length": 1692.0, + "completions/mean_length": 505.46429443359375, + "completions/mean_terminated_length": 505.46429443359375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 2.6530822801134897, + "grad_norm": 0.6751531958580017, + "kl": 0.1279296875, + "learning_rate": 1e-06, + "loss": -0.0585, + "num_tokens": 303874836.0, + "reward": 1.3433035612106323, + "reward_std": 0.21848861873149872, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3433035910129547, + "rewards/curriculum_aware_reward_fn/std": 0.41058602929115295, + "step": 2571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2091.0, + "completions/max_terminated_length": 2091.0, + "completions/mean_length": 484.1250305175781, + "completions/mean_terminated_length": 484.1250305175781, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 2.6541140056744905, + "grad_norm": 0.7291446924209595, + "kl": 0.148193359375, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 303996482.0, + "reward": 1.5625001192092896, + "reward_std": 0.20619072020053864, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5625, + "rewards/curriculum_aware_reward_fn/std": 0.4297233819961548, + "step": 2572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2233.0, + "completions/mean_length": 554.5267944335938, + "completions/mean_terminated_length": 522.6216430664062, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 2.6551457312354914, + "grad_norm": 0.6747497320175171, + "kl": 0.11474609375, + "learning_rate": 1e-06, + "loss": 0.0196, + "num_tokens": 304126818.0, + "reward": 1.4669644832611084, + "reward_std": 0.2268751561641693, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4669643044471741, + "rewards/curriculum_aware_reward_fn/std": 0.3921845555305481, + "step": 2573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2438.0, + "completions/max_terminated_length": 2438.0, + "completions/mean_length": 534.7589721679688, + "completions/mean_terminated_length": 534.7589721679688, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 2.656177456796492, + "grad_norm": 0.6876260638237, + "kl": 0.1258544921875, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 304252161.0, + "reward": 1.6241072416305542, + "reward_std": 0.2519568204879761, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6330356597900391, + "rewards/curriculum_aware_reward_fn/std": 0.37797191739082336, + "step": 2574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2329.0, + "completions/max_terminated_length": 2329.0, + "completions/mean_length": 524.0357666015625, + "completions/mean_terminated_length": 524.0357666015625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 2.657209182357493, + "grad_norm": 0.4913746416568756, + "kl": 0.14208984375, + "learning_rate": 1e-06, + "loss": 0.0323, + "num_tokens": 304370920.0, + "reward": 1.770535945892334, + "reward_std": 0.14546895027160645, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.7705356478691101, + "rewards/curriculum_aware_reward_fn/std": 0.35852259397506714, + "step": 2575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3775.0, + "completions/max_terminated_length": 3775.0, + "completions/mean_length": 506.794677734375, + "completions/mean_terminated_length": 506.794677734375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 2.6582409079184934, + "grad_norm": 0.6894189715385437, + "kl": 0.131591796875, + "learning_rate": 1e-06, + "loss": 0.0248, + "num_tokens": 304489288.0, + "reward": 1.5232144594192505, + "reward_std": 0.16796723008155823, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5232142806053162, + "rewards/curriculum_aware_reward_fn/std": 0.43100178241729736, + "step": 2576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2335.0, + "completions/mean_length": 689.5000610351562, + "completions/mean_terminated_length": 658.8108520507812, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 2.6592726334794943, + "grad_norm": 0.6391696929931641, + "kl": 0.118896484375, + "learning_rate": 1e-06, + "loss": 0.0477, + "num_tokens": 304634608.0, + "reward": 1.2937500476837158, + "reward_std": 0.2136712223291397, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.29375001788139343, + "rewards/curriculum_aware_reward_fn/std": 0.400232195854187, + "step": 2577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1691.0, + "completions/max_terminated_length": 1691.0, + "completions/mean_length": 562.8303833007812, + "completions/mean_terminated_length": 562.8303833007812, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 2.660304359040495, + "grad_norm": 0.5994539260864258, + "kl": 0.1280517578125, + "learning_rate": 1e-06, + "loss": -0.0754, + "num_tokens": 304763151.0, + "reward": 1.520982265472412, + "reward_std": 0.14971788227558136, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5209821462631226, + "rewards/curriculum_aware_reward_fn/std": 0.4159347414970398, + "step": 2578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1690.0, + "completions/max_terminated_length": 1690.0, + "completions/mean_length": 548.9732666015625, + "completions/mean_terminated_length": 548.9732666015625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 2.661336084601496, + "grad_norm": 0.5327616930007935, + "kl": 0.1219482421875, + "learning_rate": 1e-06, + "loss": -0.0258, + "num_tokens": 304882141.0, + "reward": 1.5861608982086182, + "reward_std": 0.12342019379138947, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5861606597900391, + "rewards/curriculum_aware_reward_fn/std": 0.4475765526294708, + "step": 2579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3053.0, + "completions/mean_length": 714.8839721679688, + "completions/mean_terminated_length": 684.4234619140625, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 2.662367810162497, + "grad_norm": 0.6151833534240723, + "kl": 0.1142578125, + "learning_rate": 1e-06, + "loss": 0.0252, + "num_tokens": 305038725.0, + "reward": 1.4794644117355347, + "reward_std": 0.23122107982635498, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4794642925262451, + "rewards/curriculum_aware_reward_fn/std": 0.40372997522354126, + "step": 2580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3521.0, + "completions/max_terminated_length": 3521.0, + "completions/mean_length": 678.5982666015625, + "completions/mean_terminated_length": 678.5982666015625, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 2.6633995357234976, + "grad_norm": 0.6278223991394043, + "kl": 0.115478515625, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 305184987.0, + "reward": 1.4450894594192505, + "reward_std": 0.23164869844913483, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44508928060531616, + "rewards/curriculum_aware_reward_fn/std": 0.3950687050819397, + "step": 2581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2595.0, + "completions/mean_length": 720.169677734375, + "completions/mean_terminated_length": 689.7567749023438, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 2.6644312612844985, + "grad_norm": 0.5630807280540466, + "kl": 0.1201171875, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 305335114.0, + "reward": 1.3691965341567993, + "reward_std": 0.1900208741426468, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36919641494750977, + "rewards/curriculum_aware_reward_fn/std": 0.38527193665504456, + "step": 2582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3228.0, + "completions/mean_length": 759.3750610351562, + "completions/mean_terminated_length": 698.7090454101562, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 2.6654629868454993, + "grad_norm": 0.5652838945388794, + "kl": 0.105224609375, + "learning_rate": 1e-06, + "loss": 0.0295, + "num_tokens": 305485075.0, + "reward": 1.4968751668930054, + "reward_std": 0.21129049360752106, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5147321224212646, + "rewards/curriculum_aware_reward_fn/std": 0.3854702413082123, + "step": 2583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2923.0, + "completions/mean_length": 713.6875610351562, + "completions/mean_terminated_length": 652.19091796875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 2.6664947124064997, + "grad_norm": 0.5213630199432373, + "kl": 0.1085205078125, + "learning_rate": 1e-06, + "loss": -0.0057, + "num_tokens": 305636870.0, + "reward": 1.5491071939468384, + "reward_std": 0.14842262864112854, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5491071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.40022414922714233, + "step": 2584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2775.0, + "completions/max_terminated_length": 2775.0, + "completions/mean_length": 682.5089721679688, + "completions/mean_terminated_length": 682.5089721679688, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 2.6675264379675006, + "grad_norm": 0.6713928580284119, + "kl": 0.12060546875, + "learning_rate": 1e-06, + "loss": -0.0121, + "num_tokens": 305781638.0, + "reward": 1.3986607789993286, + "reward_std": 0.2590733766555786, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.40758925676345825, + "rewards/curriculum_aware_reward_fn/std": 0.4249117374420166, + "step": 2585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2828.0, + "completions/max_terminated_length": 2828.0, + "completions/mean_length": 712.2767944335938, + "completions/mean_terminated_length": 712.2767944335938, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 2.6685581635285014, + "grad_norm": 0.5545308589935303, + "kl": 0.118896484375, + "learning_rate": 1e-06, + "loss": 0.0429, + "num_tokens": 305926933.0, + "reward": 1.5424107313156128, + "reward_std": 0.2068289816379547, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5424107313156128, + "rewards/curriculum_aware_reward_fn/std": 0.3718840479850769, + "step": 2586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3692.0, + "completions/max_terminated_length": 3692.0, + "completions/mean_length": 728.669677734375, + "completions/mean_terminated_length": 728.669677734375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 2.6695898890895022, + "grad_norm": 0.5126776695251465, + "kl": 0.111083984375, + "learning_rate": 1e-06, + "loss": 0.0081, + "num_tokens": 306075366.0, + "reward": 1.410267949104309, + "reward_std": 0.19241558015346527, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4102678596973419, + "rewards/curriculum_aware_reward_fn/std": 0.4012446105480194, + "step": 2587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1849.0, + "completions/mean_length": 641.0178833007812, + "completions/mean_terminated_length": 609.8919067382812, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 2.670621614650503, + "grad_norm": 0.535484254360199, + "kl": 0.1072998046875, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 306211392.0, + "reward": 1.4098213911056519, + "reward_std": 0.1677635759115219, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40982145071029663, + "rewards/curriculum_aware_reward_fn/std": 0.433030366897583, + "step": 2588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2886.0, + "completions/max_terminated_length": 2886.0, + "completions/mean_length": 670.8125, + "completions/mean_terminated_length": 670.8125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 2.6716533402115035, + "grad_norm": 0.5733050107955933, + "kl": 0.105712890625, + "learning_rate": 1e-06, + "loss": -0.009, + "num_tokens": 306352806.0, + "reward": 1.5258928537368774, + "reward_std": 0.15302883088588715, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5258928537368774, + "rewards/curriculum_aware_reward_fn/std": 0.41394510865211487, + "step": 2589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2376.0, + "completions/mean_length": 740.1160888671875, + "completions/mean_terminated_length": 709.8828735351562, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 2.6726850657725043, + "grad_norm": 0.5498374700546265, + "kl": 0.1051025390625, + "learning_rate": 1e-06, + "loss": -0.0322, + "num_tokens": 306506861.0, + "reward": 1.5299108028411865, + "reward_std": 0.23045654594898224, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.529910683631897, + "rewards/curriculum_aware_reward_fn/std": 0.40821850299835205, + "step": 2590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3696.0, + "completions/max_terminated_length": 3696.0, + "completions/mean_length": 621.982177734375, + "completions/mean_terminated_length": 621.982177734375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 2.673716791333505, + "grad_norm": 0.6307698488235474, + "kl": 0.1104736328125, + "learning_rate": 1e-06, + "loss": 0.0456, + "num_tokens": 306633118.0, + "reward": 1.675446629524231, + "reward_std": 0.2829442024230957, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6754463911056519, + "rewards/curriculum_aware_reward_fn/std": 0.5178783535957336, + "step": 2591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2641.0, + "completions/mean_length": 717.9285888671875, + "completions/mean_terminated_length": 687.4954833984375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 2.674748516894506, + "grad_norm": 0.4680657684803009, + "kl": 0.1002197265625, + "learning_rate": 1e-06, + "loss": 0.0183, + "num_tokens": 306780957.0, + "reward": 1.5120537281036377, + "reward_std": 0.2273402363061905, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5209821462631226, + "rewards/curriculum_aware_reward_fn/std": 0.5374547243118286, + "step": 2592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2491.0, + "completions/mean_length": 657.1339721679688, + "completions/mean_terminated_length": 626.1531982421875, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 2.675780242455507, + "grad_norm": 0.5426839590072632, + "kl": 0.1026611328125, + "learning_rate": 1e-06, + "loss": -0.0374, + "num_tokens": 306921364.0, + "reward": 1.3901787996292114, + "reward_std": 0.1473875343799591, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3901785910129547, + "rewards/curriculum_aware_reward_fn/std": 0.42218008637428284, + "step": 2593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2568.0, + "completions/max_terminated_length": 2568.0, + "completions/mean_length": 586.9107666015625, + "completions/mean_terminated_length": 586.9107666015625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 2.6768119680165077, + "grad_norm": 0.5409964323043823, + "kl": 0.1136474609375, + "learning_rate": 1e-06, + "loss": -0.0101, + "num_tokens": 307055705.0, + "reward": 1.557142972946167, + "reward_std": 0.16628770530223846, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5571428537368774, + "rewards/curriculum_aware_reward_fn/std": 0.4004421532154083, + "step": 2594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2072.0, + "completions/mean_length": 598.7232666015625, + "completions/mean_terminated_length": 567.2162475585938, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 2.6778436935775085, + "grad_norm": 0.5737888813018799, + "kl": 0.1060791015625, + "learning_rate": 1e-06, + "loss": 0.0354, + "num_tokens": 307191040.0, + "reward": 1.4977679252624512, + "reward_std": 0.14505282044410706, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4977678656578064, + "rewards/curriculum_aware_reward_fn/std": 0.449142724275589, + "step": 2595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3766.0, + "completions/max_terminated_length": 3766.0, + "completions/mean_length": 663.5535888671875, + "completions/mean_terminated_length": 663.5535888671875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 2.6788754191385094, + "grad_norm": 0.619125485420227, + "kl": 0.112548828125, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 307330570.0, + "reward": 1.4647324085235596, + "reward_std": 0.22834059596061707, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46473217010498047, + "rewards/curriculum_aware_reward_fn/std": 0.42466938495635986, + "step": 2596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3225.0, + "completions/mean_length": 693.1607666015625, + "completions/mean_terminated_length": 662.5045166015625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 2.67990714469951, + "grad_norm": 0.4550981819629669, + "kl": 0.0999755859375, + "learning_rate": 1e-06, + "loss": 0.0339, + "num_tokens": 307474676.0, + "reward": 1.533928632736206, + "reward_std": 0.11954943090677261, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5339285731315613, + "rewards/curriculum_aware_reward_fn/std": 0.45208296179771423, + "step": 2597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1921.0, + "completions/max_terminated_length": 1921.0, + "completions/mean_length": 548.3303833007812, + "completions/mean_terminated_length": 548.3303833007812, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 2.6809388702605106, + "grad_norm": 0.6486082673072815, + "kl": 0.1171875, + "learning_rate": 1e-06, + "loss": 0.0423, + "num_tokens": 307594133.0, + "reward": 1.4955357313156128, + "reward_std": 0.19790183007717133, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4955357015132904, + "rewards/curriculum_aware_reward_fn/std": 0.40177270770072937, + "step": 2598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1431.0, + "completions/max_terminated_length": 1431.0, + "completions/mean_length": 614.2857666015625, + "completions/mean_terminated_length": 614.2857666015625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 2.6819705958215114, + "grad_norm": 0.6386866569519043, + "kl": 0.10986328125, + "learning_rate": 1e-06, + "loss": 0.0095, + "num_tokens": 307734018.0, + "reward": 1.4941965341567993, + "reward_std": 0.18490223586559296, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.49419641494750977, + "rewards/curriculum_aware_reward_fn/std": 0.43323346972465515, + "step": 2599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3371.0, + "completions/max_terminated_length": 3371.0, + "completions/mean_length": 541.107177734375, + "completions/mean_terminated_length": 541.107177734375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 2.6830023213825123, + "grad_norm": 0.6429569125175476, + "kl": 0.1248779296875, + "learning_rate": 1e-06, + "loss": 0.0343, + "num_tokens": 307858861.0, + "reward": 1.5316965579986572, + "reward_std": 0.1632005274295807, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5316964387893677, + "rewards/curriculum_aware_reward_fn/std": 0.41940516233444214, + "step": 2600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1910.0, + "completions/max_terminated_length": 1910.0, + "completions/mean_length": 525.919677734375, + "completions/mean_terminated_length": 525.919677734375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 2.684034046943513, + "grad_norm": 0.6729943752288818, + "kl": 0.11474609375, + "learning_rate": 1e-06, + "loss": -0.0122, + "num_tokens": 307981541.0, + "reward": 1.5808037519454956, + "reward_std": 0.23689059913158417, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5808035731315613, + "rewards/curriculum_aware_reward_fn/std": 0.42501965165138245, + "step": 2601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1432.0, + "completions/max_terminated_length": 1432.0, + "completions/mean_length": 522.9285888671875, + "completions/mean_terminated_length": 522.9285888671875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 2.685065772504514, + "grad_norm": 0.5555217862129211, + "kl": 0.108154296875, + "learning_rate": 1e-06, + "loss": 0.0684, + "num_tokens": 308102395.0, + "reward": 1.6214287281036377, + "reward_std": 0.1849846988916397, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6303571462631226, + "rewards/curriculum_aware_reward_fn/std": 0.4196929931640625, + "step": 2602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3274.0, + "completions/max_terminated_length": 3274.0, + "completions/mean_length": 547.8482666015625, + "completions/mean_terminated_length": 547.8482666015625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 2.6860974980655143, + "grad_norm": 0.6727123856544495, + "kl": 0.113037109375, + "learning_rate": 1e-06, + "loss": -0.0696, + "num_tokens": 308227496.0, + "reward": 1.6245537996292114, + "reward_std": 0.19118796288967133, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6245535612106323, + "rewards/curriculum_aware_reward_fn/std": 0.4422503709793091, + "step": 2603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1702.0, + "completions/max_terminated_length": 1702.0, + "completions/mean_length": 525.8660888671875, + "completions/mean_terminated_length": 525.8660888671875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 2.687129223626515, + "grad_norm": 0.6231337189674377, + "kl": 0.1219482421875, + "learning_rate": 1e-06, + "loss": 0.0053, + "num_tokens": 308362511.0, + "reward": 1.5691964626312256, + "reward_std": 0.1808921843767166, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5691964030265808, + "rewards/curriculum_aware_reward_fn/std": 0.45053747296333313, + "step": 2604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3166.0, + "completions/max_terminated_length": 3166.0, + "completions/mean_length": 567.9732666015625, + "completions/mean_terminated_length": 567.9732666015625, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 2.688160949187516, + "grad_norm": 0.5709600448608398, + "kl": 0.112060546875, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 308504111.0, + "reward": 1.540178656578064, + "reward_std": 0.14044509828090668, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5491071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.45167267322540283, + "step": 2605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2727.0, + "completions/max_terminated_length": 2727.0, + "completions/mean_length": 446.9732360839844, + "completions/mean_terminated_length": 446.9732360839844, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 2.689192674748517, + "grad_norm": 0.711565375328064, + "kl": 0.111083984375, + "learning_rate": 1e-06, + "loss": -0.0235, + "num_tokens": 308615912.0, + "reward": 1.7169643640518188, + "reward_std": 0.17919939756393433, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.7169643044471741, + "rewards/curriculum_aware_reward_fn/std": 0.4117942154407501, + "step": 2606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1516.0, + "completions/max_terminated_length": 1516.0, + "completions/mean_length": 549.8660888671875, + "completions/mean_terminated_length": 549.8660888671875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 2.6902244003095177, + "grad_norm": 0.6628137826919556, + "kl": 0.11865234375, + "learning_rate": 1e-06, + "loss": 0.0278, + "num_tokens": 308738551.0, + "reward": 1.5236608982086182, + "reward_std": 0.2348913550376892, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5236607193946838, + "rewards/curriculum_aware_reward_fn/std": 0.4373837113380432, + "step": 2607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1675.0, + "completions/mean_length": 593.5089721679688, + "completions/mean_terminated_length": 561.9549560546875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 2.6912561258705185, + "grad_norm": 0.6050135493278503, + "kl": 0.104248046875, + "learning_rate": 1e-06, + "loss": 0.0199, + "num_tokens": 308877957.0, + "reward": 1.4187501668930054, + "reward_std": 0.23862913250923157, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.42767858505249023, + "rewards/curriculum_aware_reward_fn/std": 0.4206836521625519, + "step": 2608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3093.0, + "completions/max_terminated_length": 3093.0, + "completions/mean_length": 484.8750305175781, + "completions/mean_terminated_length": 484.8750305175781, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 2.6922878514315194, + "grad_norm": 0.6855359673500061, + "kl": 0.123291015625, + "learning_rate": 1e-06, + "loss": 0.0385, + "num_tokens": 308991490.0, + "reward": 1.5218751430511475, + "reward_std": 0.20726396143436432, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5218750238418579, + "rewards/curriculum_aware_reward_fn/std": 0.4614792466163635, + "step": 2609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2536.0, + "completions/max_terminated_length": 2536.0, + "completions/mean_length": 549.6964721679688, + "completions/mean_terminated_length": 549.6964721679688, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 2.6933195769925202, + "grad_norm": 0.6386505961418152, + "kl": 0.1151123046875, + "learning_rate": 1e-06, + "loss": 0.0554, + "num_tokens": 309129699.0, + "reward": 1.6308035850524902, + "reward_std": 0.20738892257213593, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6397321820259094, + "rewards/curriculum_aware_reward_fn/std": 0.4121541976928711, + "step": 2610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2056.0, + "completions/max_terminated_length": 2056.0, + "completions/mean_length": 539.4732666015625, + "completions/mean_terminated_length": 539.4732666015625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 2.6943513025535206, + "grad_norm": 0.6469184756278992, + "kl": 0.1051025390625, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 309251454.0, + "reward": 1.4625000953674316, + "reward_std": 0.21683713793754578, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.47142860293388367, + "rewards/curriculum_aware_reward_fn/std": 0.4458373188972473, + "step": 2611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1871.0, + "completions/max_terminated_length": 1871.0, + "completions/mean_length": 487.669677734375, + "completions/mean_terminated_length": 487.669677734375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 2.6953830281145215, + "grad_norm": 0.6858476400375366, + "kl": 0.1148681640625, + "learning_rate": 1e-06, + "loss": -0.0191, + "num_tokens": 309375723.0, + "reward": 1.3611608743667603, + "reward_std": 0.1952558159828186, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3611606955528259, + "rewards/curriculum_aware_reward_fn/std": 0.3955672085285187, + "step": 2612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1518.0, + "completions/max_terminated_length": 1518.0, + "completions/mean_length": 524.4910888671875, + "completions/mean_terminated_length": 524.4910888671875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 2.6964147536755223, + "grad_norm": 0.7016357183456421, + "kl": 0.129638671875, + "learning_rate": 1e-06, + "loss": 0.0334, + "num_tokens": 309501393.0, + "reward": 1.4205358028411865, + "reward_std": 0.1922624260187149, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42053571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.3992421627044678, + "step": 2613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2341.0, + "completions/max_terminated_length": 2341.0, + "completions/mean_length": 634.7410888671875, + "completions/mean_terminated_length": 634.7410888671875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 2.697446479236523, + "grad_norm": 0.6351801156997681, + "kl": 0.1048583984375, + "learning_rate": 1e-06, + "loss": -0.035, + "num_tokens": 309633706.0, + "reward": 1.4013392925262451, + "reward_std": 0.21559014916419983, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40133926272392273, + "rewards/curriculum_aware_reward_fn/std": 0.4269871115684509, + "step": 2614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1732.0, + "completions/max_terminated_length": 1732.0, + "completions/mean_length": 546.3928833007812, + "completions/mean_terminated_length": 546.3928833007812, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 2.698478204797524, + "grad_norm": 0.7185891270637512, + "kl": 0.13037109375, + "learning_rate": 1e-06, + "loss": -0.0217, + "num_tokens": 309758427.0, + "reward": 1.3339287042617798, + "reward_std": 0.181352436542511, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.33392858505249023, + "rewards/curriculum_aware_reward_fn/std": 0.3564220666885376, + "step": 2615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1137.0, + "completions/max_terminated_length": 1137.0, + "completions/mean_length": 467.33929443359375, + "completions/mean_terminated_length": 467.33929443359375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 2.6995099303585244, + "grad_norm": 0.6989985704421997, + "kl": 0.127685546875, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 309878459.0, + "reward": 1.5276787281036377, + "reward_std": 0.2159861922264099, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5366071462631226, + "rewards/curriculum_aware_reward_fn/std": 0.4386703073978424, + "step": 2616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1435.0, + "completions/mean_length": 503.3214416503906, + "completions/mean_terminated_length": 470.9549560546875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 2.700541655919525, + "grad_norm": 0.5956876277923584, + "kl": 0.110107421875, + "learning_rate": 1e-06, + "loss": 0.013, + "num_tokens": 310002801.0, + "reward": 1.591071605682373, + "reward_std": 0.143727108836174, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5910714268684387, + "rewards/curriculum_aware_reward_fn/std": 0.42225342988967896, + "step": 2617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2377.0, + "completions/max_terminated_length": 2377.0, + "completions/mean_length": 579.8482666015625, + "completions/mean_terminated_length": 579.8482666015625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 2.701573381480526, + "grad_norm": 0.7142231464385986, + "kl": 0.1292724609375, + "learning_rate": 1e-06, + "loss": 0.0184, + "num_tokens": 310140138.0, + "reward": 1.5075894594192505, + "reward_std": 0.1956642121076584, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5075892806053162, + "rewards/curriculum_aware_reward_fn/std": 0.4249117374420166, + "step": 2618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2371.0, + "completions/max_terminated_length": 2371.0, + "completions/mean_length": 528.4375, + "completions/mean_terminated_length": 528.4375, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 2.702605107041527, + "grad_norm": 0.6770662069320679, + "kl": 0.104248046875, + "learning_rate": 1e-06, + "loss": 0.0519, + "num_tokens": 310259243.0, + "reward": 1.4629465341567993, + "reward_std": 0.17913131415843964, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.47187501192092896, + "rewards/curriculum_aware_reward_fn/std": 0.42937105894088745, + "step": 2619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1923.0, + "completions/max_terminated_length": 1923.0, + "completions/mean_length": 538.919677734375, + "completions/mean_terminated_length": 538.919677734375, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 2.7036368326025277, + "grad_norm": 0.7300458550453186, + "kl": 0.1259765625, + "learning_rate": 1e-06, + "loss": 0.0344, + "num_tokens": 310384121.0, + "reward": 1.4950894117355347, + "reward_std": 0.20447319746017456, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4950892925262451, + "rewards/curriculum_aware_reward_fn/std": 0.3622974455356598, + "step": 2620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1927.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 550.2142944335938, + "completions/mean_terminated_length": 550.2142944335938, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 2.7046685581635286, + "grad_norm": 0.647113561630249, + "kl": 0.1112060546875, + "learning_rate": 1e-06, + "loss": 0.0539, + "num_tokens": 310508759.0, + "reward": 1.5254465341567993, + "reward_std": 0.2031795084476471, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5254464149475098, + "rewards/curriculum_aware_reward_fn/std": 0.4302730858325958, + "step": 2621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3646.0, + "completions/max_terminated_length": 3646.0, + "completions/mean_length": 577.2142944335938, + "completions/mean_terminated_length": 577.2142944335938, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 2.7057002837245294, + "grad_norm": 4.672196865081787, + "kl": 0.124755859375, + "learning_rate": 1e-06, + "loss": -0.0161, + "num_tokens": 310644422.0, + "reward": 1.4906251430511475, + "reward_std": 0.19837768375873566, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4906250536441803, + "rewards/curriculum_aware_reward_fn/std": 0.3786255121231079, + "step": 2622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1268.0, + "completions/max_terminated_length": 1268.0, + "completions/mean_length": 491.669677734375, + "completions/mean_terminated_length": 491.669677734375, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 2.7067320092855303, + "grad_norm": 0.5772094130516052, + "kl": 0.125244140625, + "learning_rate": 1e-06, + "loss": -0.0382, + "num_tokens": 310763245.0, + "reward": 1.5593750476837158, + "reward_std": 0.15548592805862427, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.559374988079071, + "rewards/curriculum_aware_reward_fn/std": 0.3872129023075104, + "step": 2623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1660.0, + "completions/max_terminated_length": 1660.0, + "completions/mean_length": 440.0625305175781, + "completions/mean_terminated_length": 440.0625305175781, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 2.7077637348465307, + "grad_norm": 0.700062096118927, + "kl": 0.13427734375, + "learning_rate": 1e-06, + "loss": -0.0094, + "num_tokens": 310881678.0, + "reward": 1.3589287996292114, + "reward_std": 0.20154716074466705, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3678571581840515, + "rewards/curriculum_aware_reward_fn/std": 0.3995412290096283, + "step": 2624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1808.0, + "completions/max_terminated_length": 1808.0, + "completions/mean_length": 487.0714416503906, + "completions/mean_terminated_length": 487.0714416503906, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 2.7087954604075315, + "grad_norm": 0.5862681269645691, + "kl": 0.119140625, + "learning_rate": 1e-06, + "loss": 0.0409, + "num_tokens": 310998180.0, + "reward": 1.5602679252624512, + "reward_std": 0.15142853558063507, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5602678656578064, + "rewards/curriculum_aware_reward_fn/std": 0.4182297885417938, + "step": 2625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2639.0, + "completions/max_terminated_length": 2639.0, + "completions/mean_length": 518.1517944335938, + "completions/mean_terminated_length": 518.1517944335938, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 2.7098271859685323, + "grad_norm": 0.7043747901916504, + "kl": 0.123779296875, + "learning_rate": 1e-06, + "loss": 0.0186, + "num_tokens": 311122881.0, + "reward": 1.4928573369979858, + "reward_std": 0.20403318107128143, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4928571879863739, + "rewards/curriculum_aware_reward_fn/std": 0.42036324739456177, + "step": 2626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2773.0, + "completions/mean_length": 614.2857666015625, + "completions/mean_terminated_length": 582.9189453125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 2.710858911529533, + "grad_norm": 0.41442379355430603, + "kl": 0.1044921875, + "learning_rate": 1e-06, + "loss": -0.0121, + "num_tokens": 311267316.0, + "reward": 1.4424108266830444, + "reward_std": 0.08449666202068329, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44241073727607727, + "rewards/curriculum_aware_reward_fn/std": 0.4626213014125824, + "step": 2627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2518.0, + "completions/mean_length": 618.9285888671875, + "completions/mean_terminated_length": 587.6036376953125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 2.711890637090534, + "grad_norm": 0.663968563079834, + "kl": 0.1131591796875, + "learning_rate": 1e-06, + "loss": -0.003, + "num_tokens": 311416212.0, + "reward": 1.504910945892334, + "reward_std": 0.23447324335575104, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5049107670783997, + "rewards/curriculum_aware_reward_fn/std": 0.40380844473838806, + "step": 2628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1670.0, + "completions/max_terminated_length": 1670.0, + "completions/mean_length": 483.4910888671875, + "completions/mean_terminated_length": 483.4910888671875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 2.712922362651535, + "grad_norm": 148691.9375, + "kl": 2608.08203125, + "learning_rate": 1e-06, + "loss": 26.2699, + "num_tokens": 311540013.0, + "reward": 1.6138393878936768, + "reward_std": 0.16646739840507507, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6227678656578064, + "rewards/curriculum_aware_reward_fn/std": 0.4234078824520111, + "step": 2629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1201.0, + "completions/max_terminated_length": 1201.0, + "completions/mean_length": 457.5982360839844, + "completions/mean_terminated_length": 457.5982360839844, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 2.7139540882125353, + "grad_norm": 0.6651673316955566, + "kl": 0.124755859375, + "learning_rate": 1e-06, + "loss": 0.0061, + "num_tokens": 311651317.0, + "reward": 1.5125001668930054, + "reward_std": 0.18927127122879028, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.512499988079071, + "rewards/curriculum_aware_reward_fn/std": 0.45393672585487366, + "step": 2630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3100.0, + "completions/max_terminated_length": 3100.0, + "completions/mean_length": 535.6160888671875, + "completions/mean_terminated_length": 535.6160888671875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 2.714985813773536, + "grad_norm": 0.7526648640632629, + "kl": 0.123291015625, + "learning_rate": 1e-06, + "loss": 0.0602, + "num_tokens": 311778660.0, + "reward": 1.4522321224212646, + "reward_std": 0.20325230062007904, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45223215222358704, + "rewards/curriculum_aware_reward_fn/std": 0.381579726934433, + "step": 2631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1229.0, + "completions/max_terminated_length": 1229.0, + "completions/mean_length": 466.107177734375, + "completions/mean_terminated_length": 466.107177734375, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 2.716017539334537, + "grad_norm": 0.6304461359977722, + "kl": 0.118408203125, + "learning_rate": 1e-06, + "loss": 0.0294, + "num_tokens": 311901872.0, + "reward": 1.5357143878936768, + "reward_std": 0.2242029458284378, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5357142686843872, + "rewards/curriculum_aware_reward_fn/std": 0.5000643134117126, + "step": 2632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1372.0, + "completions/max_terminated_length": 1372.0, + "completions/mean_length": 473.419677734375, + "completions/mean_terminated_length": 473.419677734375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 2.7170492648955378, + "grad_norm": 0.6883236169815063, + "kl": 0.1221923828125, + "learning_rate": 1e-06, + "loss": 0.0293, + "num_tokens": 312024517.0, + "reward": 1.7473214864730835, + "reward_std": 0.13166259229183197, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.7473214268684387, + "rewards/curriculum_aware_reward_fn/std": 0.3198986053466797, + "step": 2633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1199.0, + "completions/max_terminated_length": 1199.0, + "completions/mean_length": 423.71429443359375, + "completions/mean_terminated_length": 423.71429443359375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 2.7180809904565386, + "grad_norm": 0.6701761484146118, + "kl": 0.108642578125, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 312133825.0, + "reward": 1.769642949104309, + "reward_std": 0.14054904878139496, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.7696428298950195, + "rewards/curriculum_aware_reward_fn/std": 0.3735848367214203, + "step": 2634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1783.0, + "completions/max_terminated_length": 1783.0, + "completions/mean_length": 441.4464416503906, + "completions/mean_terminated_length": 441.4464416503906, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 2.7191127160175395, + "grad_norm": 0.7455883622169495, + "kl": 0.1475830078125, + "learning_rate": 1e-06, + "loss": 0.0452, + "num_tokens": 312248696.0, + "reward": 1.4531251192092896, + "reward_std": 0.16912443935871124, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.453125, + "rewards/curriculum_aware_reward_fn/std": 0.42681944370269775, + "step": 2635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1149.0, + "completions/mean_length": 571.0357666015625, + "completions/mean_terminated_length": 474.0183410644531, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 2.7201444415785403, + "grad_norm": 0.6181263327598572, + "kl": 0.11181640625, + "learning_rate": 1e-06, + "loss": -0.0085, + "num_tokens": 312384815.0, + "reward": 1.5370537042617798, + "reward_std": 0.20239168405532837, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5370535850524902, + "rewards/curriculum_aware_reward_fn/std": 0.4414220154285431, + "step": 2636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3605.0, + "completions/mean_length": 512.0714721679688, + "completions/mean_terminated_length": 446.9090881347656, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 2.721176167139541, + "grad_norm": 0.6179798245429993, + "kl": 0.1160888671875, + "learning_rate": 1e-06, + "loss": -0.0371, + "num_tokens": 312509374.0, + "reward": 1.5450894832611084, + "reward_std": 0.1635485589504242, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5450892448425293, + "rewards/curriculum_aware_reward_fn/std": 0.43178656697273254, + "step": 2637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1103.0, + "completions/mean_length": 531.9375, + "completions/mean_terminated_length": 499.8288269042969, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 2.7222078927005415, + "grad_norm": 0.6158668994903564, + "kl": 0.1302490234375, + "learning_rate": 1e-06, + "loss": 0.0375, + "num_tokens": 312638909.0, + "reward": 1.4093750715255737, + "reward_std": 0.16171173751354218, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.41830357909202576, + "rewards/curriculum_aware_reward_fn/std": 0.43262219429016113, + "step": 2638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1644.0, + "completions/max_terminated_length": 1644.0, + "completions/mean_length": 457.15179443359375, + "completions/mean_terminated_length": 457.15179443359375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 2.7232396182615424, + "grad_norm": 0.7206525206565857, + "kl": 0.1240234375, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 312757548.0, + "reward": 1.4991071224212646, + "reward_std": 0.16917501389980316, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.49910715222358704, + "rewards/curriculum_aware_reward_fn/std": 0.4314746558666229, + "step": 2639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2808.0, + "completions/max_terminated_length": 2808.0, + "completions/mean_length": 410.27679443359375, + "completions/mean_terminated_length": 410.27679443359375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 2.724271343822543, + "grad_norm": 0.7098089456558228, + "kl": 0.1295166015625, + "learning_rate": 1e-06, + "loss": 0.053, + "num_tokens": 312866520.0, + "reward": 1.6053574085235596, + "reward_std": 0.20159611105918884, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6053571701049805, + "rewards/curriculum_aware_reward_fn/std": 0.4335770606994629, + "step": 2640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1616.0, + "completions/max_terminated_length": 1616.0, + "completions/mean_length": 533.107177734375, + "completions/mean_terminated_length": 533.107177734375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 2.725303069383544, + "grad_norm": 0.6682390570640564, + "kl": 0.1396484375, + "learning_rate": 1e-06, + "loss": -0.0051, + "num_tokens": 313002325.0, + "reward": 1.345089316368103, + "reward_std": 0.18961378931999207, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.34508928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.36415764689445496, + "step": 2641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2577.0, + "completions/max_terminated_length": 2577.0, + "completions/mean_length": 557.3035888671875, + "completions/mean_terminated_length": 557.3035888671875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 2.726334794944545, + "grad_norm": 0.5817402601242065, + "kl": 0.1025390625, + "learning_rate": 1e-06, + "loss": 0.0154, + "num_tokens": 313133861.0, + "reward": 1.4174107313156128, + "reward_std": 0.18837326765060425, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4174107611179352, + "rewards/curriculum_aware_reward_fn/std": 0.4443189799785614, + "step": 2642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1767.0, + "completions/max_terminated_length": 1767.0, + "completions/mean_length": 505.8482360839844, + "completions/mean_terminated_length": 505.8482360839844, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 2.7273665205055453, + "grad_norm": 0.7893069982528687, + "kl": 0.126953125, + "learning_rate": 1e-06, + "loss": 0.0055, + "num_tokens": 313251360.0, + "reward": 1.5683037042617798, + "reward_std": 0.22728142142295837, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5683035254478455, + "rewards/curriculum_aware_reward_fn/std": 0.3967205286026001, + "step": 2643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1756.0, + "completions/max_terminated_length": 1756.0, + "completions/mean_length": 526.2053833007812, + "completions/mean_terminated_length": 526.2053833007812, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 2.728398246066546, + "grad_norm": 1.0762910842895508, + "kl": 0.125, + "learning_rate": 1e-06, + "loss": -0.0875, + "num_tokens": 313373239.0, + "reward": 1.5075894594192505, + "reward_std": 0.2532075047492981, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5165178179740906, + "rewards/curriculum_aware_reward_fn/std": 0.5285279154777527, + "step": 2644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1864.0, + "completions/max_terminated_length": 1864.0, + "completions/mean_length": 533.5803833007812, + "completions/mean_terminated_length": 533.5803833007812, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 2.729429971627547, + "grad_norm": 0.5750779509544373, + "kl": 0.1351318359375, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 313503714.0, + "reward": 1.6111608743667603, + "reward_std": 0.21896274387836456, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6111606955528259, + "rewards/curriculum_aware_reward_fn/std": 0.4266309142112732, + "step": 2645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3007.0, + "completions/max_terminated_length": 3007.0, + "completions/mean_length": 578.0803833007812, + "completions/mean_terminated_length": 578.0803833007812, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 2.730461697188548, + "grad_norm": 0.5010013580322266, + "kl": 0.13671875, + "learning_rate": 1e-06, + "loss": 0.0382, + "num_tokens": 313643527.0, + "reward": 1.4477678537368774, + "reward_std": 0.1527036726474762, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.47455358505249023, + "rewards/curriculum_aware_reward_fn/std": 0.43397367000579834, + "step": 2646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2310.0, + "completions/mean_length": 656.5714721679688, + "completions/mean_terminated_length": 594.0363159179688, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 2.7314934227495486, + "grad_norm": 0.5431753396987915, + "kl": 0.1512451171875, + "learning_rate": 1e-06, + "loss": 0.0567, + "num_tokens": 313783927.0, + "reward": 1.419196605682373, + "reward_std": 0.18558138608932495, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.4459821581840515, + "rewards/curriculum_aware_reward_fn/std": 0.41313666105270386, + "step": 2647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2747.0, + "completions/mean_length": 712.2500610351562, + "completions/mean_terminated_length": 681.7658081054688, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 2.7325251483105495, + "grad_norm": 0.5787481069564819, + "kl": 0.123046875, + "learning_rate": 1e-06, + "loss": 0.039, + "num_tokens": 313927588.0, + "reward": 1.3513394594192505, + "reward_std": 0.17803417146205902, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.36026784777641296, + "rewards/curriculum_aware_reward_fn/std": 0.37860211730003357, + "step": 2648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2207.0, + "completions/max_terminated_length": 2207.0, + "completions/mean_length": 581.9910888671875, + "completions/mean_terminated_length": 581.9910888671875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 2.7335568738715503, + "grad_norm": 0.6699755787849426, + "kl": 0.156494140625, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 314061700.0, + "reward": 1.5098215341567993, + "reward_std": 0.30378657579421997, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5098214149475098, + "rewards/curriculum_aware_reward_fn/std": 0.4118645489215851, + "step": 2649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3618.0, + "completions/mean_length": 695.2678833007812, + "completions/mean_terminated_length": 633.4363403320312, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 2.734588599432551, + "grad_norm": 0.4774550497531891, + "kl": 0.133056640625, + "learning_rate": 1e-06, + "loss": 0.0942, + "num_tokens": 314211089.0, + "reward": 1.4571430683135986, + "reward_std": 0.13818678259849548, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.47499996423721313, + "rewards/curriculum_aware_reward_fn/std": 0.4217616617679596, + "step": 2650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4059.0, + "completions/max_terminated_length": 4059.0, + "completions/mean_length": 576.4642944335938, + "completions/mean_terminated_length": 576.4642944335938, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 2.7356203249935516, + "grad_norm": 0.4988778531551361, + "kl": 0.145263671875, + "learning_rate": 1e-06, + "loss": 0.0311, + "num_tokens": 314337967.0, + "reward": 1.5928571224212646, + "reward_std": 0.25025662779808044, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.610714316368103, + "rewards/curriculum_aware_reward_fn/std": 0.524036169052124, + "step": 2651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2997.0, + "completions/mean_length": 699.5267944335938, + "completions/mean_terminated_length": 637.772705078125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 2.7366520505545524, + "grad_norm": 0.5151804685592651, + "kl": 0.124267578125, + "learning_rate": 1e-06, + "loss": -0.0272, + "num_tokens": 314487112.0, + "reward": 1.4821430444717407, + "reward_std": 0.23278112709522247, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5, + "rewards/curriculum_aware_reward_fn/std": 0.43723124265670776, + "step": 2652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1494.0, + "completions/max_terminated_length": 1494.0, + "completions/mean_length": 571.2678833007812, + "completions/mean_terminated_length": 571.2678833007812, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 2.7376837761155532, + "grad_norm": 0.5588393211364746, + "kl": 0.1396484375, + "learning_rate": 1e-06, + "loss": 0.0351, + "num_tokens": 314613730.0, + "reward": 1.6312501430511475, + "reward_std": 0.23117785155773163, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1621822714805603, + "rewards/curriculum_aware_reward_fn/mean": 0.6580356955528259, + "rewards/curriculum_aware_reward_fn/std": 0.42598870396614075, + "step": 2653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1377.0, + "completions/max_terminated_length": 1377.0, + "completions/mean_length": 511.7500305175781, + "completions/mean_terminated_length": 511.7500305175781, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 2.738715501676554, + "grad_norm": 0.6131327748298645, + "kl": 0.15673828125, + "learning_rate": 1e-06, + "loss": 0.0306, + "num_tokens": 314738428.0, + "reward": 1.487946629524231, + "reward_std": 0.188005730509758, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5058035850524902, + "rewards/curriculum_aware_reward_fn/std": 0.4395301043987274, + "step": 2654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1420.0, + "completions/max_terminated_length": 1420.0, + "completions/mean_length": 545.9017944335938, + "completions/mean_terminated_length": 545.9017944335938, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 2.739747227237555, + "grad_norm": 0.6534278392791748, + "kl": 0.14794921875, + "learning_rate": 1e-06, + "loss": -0.0196, + "num_tokens": 314865145.0, + "reward": 1.4830358028411865, + "reward_std": 0.29470551013946533, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48303571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.41190358996391296, + "step": 2655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2448.0, + "completions/max_terminated_length": 2448.0, + "completions/mean_length": 597.357177734375, + "completions/mean_terminated_length": 597.357177734375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 2.7407789527985553, + "grad_norm": 0.5453312993049622, + "kl": 0.144775390625, + "learning_rate": 1e-06, + "loss": 0.0692, + "num_tokens": 314995868.0, + "reward": 1.5629465579986572, + "reward_std": 0.17557427287101746, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5629464387893677, + "rewards/curriculum_aware_reward_fn/std": 0.40037962794303894, + "step": 2656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2168.0, + "completions/max_terminated_length": 2168.0, + "completions/mean_length": 551.7232666015625, + "completions/mean_terminated_length": 551.7232666015625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 2.741810678359556, + "grad_norm": 0.5800078511238098, + "kl": 0.156005859375, + "learning_rate": 1e-06, + "loss": -0.0146, + "num_tokens": 315122768.0, + "reward": 1.407142996788025, + "reward_std": 0.23318617045879364, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.40714284777641296, + "rewards/curriculum_aware_reward_fn/std": 0.44198858737945557, + "step": 2657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1561.0, + "completions/max_terminated_length": 1561.0, + "completions/mean_length": 497.919677734375, + "completions/mean_terminated_length": 497.919677734375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 2.742842403920557, + "grad_norm": 0.5884579420089722, + "kl": 0.156982421875, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 315244251.0, + "reward": 1.593750238418579, + "reward_std": 0.144059419631958, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.59375, + "rewards/curriculum_aware_reward_fn/std": 0.43599969148635864, + "step": 2658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2169.0, + "completions/max_terminated_length": 2169.0, + "completions/mean_length": 517.5535888671875, + "completions/mean_terminated_length": 517.5535888671875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 2.743874129481558, + "grad_norm": 0.5885396003723145, + "kl": 0.150146484375, + "learning_rate": 1e-06, + "loss": 0.0635, + "num_tokens": 315364020.0, + "reward": 1.4656251668930054, + "reward_std": 0.10819364339113235, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46562501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.43058890104293823, + "step": 2659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1594.0, + "completions/mean_length": 617.232177734375, + "completions/mean_terminated_length": 553.9818115234375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 2.7449058550425587, + "grad_norm": 0.6162183284759521, + "kl": 0.15185546875, + "learning_rate": 1e-06, + "loss": 0.0335, + "num_tokens": 315500742.0, + "reward": 1.3535715341567993, + "reward_std": 0.23010680079460144, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.36249998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.4042288064956665, + "step": 2660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1577.0, + "completions/mean_length": 586.7678833007812, + "completions/mean_terminated_length": 555.1531372070312, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 2.7459375806035595, + "grad_norm": 0.5109012126922607, + "kl": 0.15234375, + "learning_rate": 1e-06, + "loss": 0.019, + "num_tokens": 315637447.0, + "reward": 1.4986608028411865, + "reward_std": 0.12022580206394196, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.49866071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.4184624254703522, + "step": 2661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1468.0, + "completions/mean_length": 581.5089721679688, + "completions/mean_terminated_length": 549.8468627929688, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 2.7469693061645604, + "grad_norm": 0.6965451240539551, + "kl": 0.146240234375, + "learning_rate": 1e-06, + "loss": -0.0067, + "num_tokens": 315777224.0, + "reward": 1.5075894594192505, + "reward_std": 0.15963546931743622, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5075892806053162, + "rewards/curriculum_aware_reward_fn/std": 0.40859073400497437, + "step": 2662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2698.0, + "completions/max_terminated_length": 2698.0, + "completions/mean_length": 538.8660888671875, + "completions/mean_terminated_length": 538.8660888671875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 2.748001031725561, + "grad_norm": 0.6566807627677917, + "kl": 0.160400390625, + "learning_rate": 1e-06, + "loss": -0.0306, + "num_tokens": 315907759.0, + "reward": 1.626339316368103, + "reward_std": 0.187081977725029, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6263392567634583, + "rewards/curriculum_aware_reward_fn/std": 0.44382449984550476, + "step": 2663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1671.0, + "completions/max_terminated_length": 1671.0, + "completions/mean_length": 505.3214416503906, + "completions/mean_terminated_length": 505.3214416503906, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 2.749032757286562, + "grad_norm": 0.6896383762359619, + "kl": 0.1728515625, + "learning_rate": 1e-06, + "loss": 0.0068, + "num_tokens": 316033662.0, + "reward": 1.60535728931427, + "reward_std": 0.19829097390174866, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6053571105003357, + "rewards/curriculum_aware_reward_fn/std": 0.5537487268447876, + "step": 2664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1456.0, + "completions/max_terminated_length": 1456.0, + "completions/mean_length": 576.625, + "completions/mean_terminated_length": 576.625, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 2.7500644828475624, + "grad_norm": 0.6218982338905334, + "kl": 0.161865234375, + "learning_rate": 1e-06, + "loss": 0.0197, + "num_tokens": 316168118.0, + "reward": 1.5227679014205933, + "reward_std": 0.1973244547843933, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5227678418159485, + "rewards/curriculum_aware_reward_fn/std": 0.38054537773132324, + "step": 2665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1675.0, + "completions/max_terminated_length": 1675.0, + "completions/mean_length": 563.9464721679688, + "completions/mean_terminated_length": 563.9464721679688, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 2.7510962084085633, + "grad_norm": 0.7110979557037354, + "kl": 0.17333984375, + "learning_rate": 1e-06, + "loss": 0.0263, + "num_tokens": 316296198.0, + "reward": 1.5325894355773926, + "reward_std": 0.2403583526611328, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.532589316368103, + "rewards/curriculum_aware_reward_fn/std": 0.4220131039619446, + "step": 2666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1559.0, + "completions/max_terminated_length": 1559.0, + "completions/mean_length": 504.76788330078125, + "completions/mean_terminated_length": 504.76788330078125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 2.752127933969564, + "grad_norm": 0.7367867827415466, + "kl": 0.220703125, + "learning_rate": 1e-06, + "loss": 0.0293, + "num_tokens": 316425630.0, + "reward": 1.5066965818405151, + "reward_std": 0.1925990879535675, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5066964030265808, + "rewards/curriculum_aware_reward_fn/std": 0.4251388609409332, + "step": 2667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2088.0, + "completions/max_terminated_length": 2088.0, + "completions/mean_length": 549.3214721679688, + "completions/mean_terminated_length": 549.3214721679688, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 2.753159659530565, + "grad_norm": 0.661249577999115, + "kl": 0.168701171875, + "learning_rate": 1e-06, + "loss": -0.0191, + "num_tokens": 316558449.0, + "reward": 1.5776787996292114, + "reward_std": 0.3079984784126282, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5955356955528259, + "rewards/curriculum_aware_reward_fn/std": 0.40695273876190186, + "step": 2668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3670.0, + "completions/max_terminated_length": 3670.0, + "completions/mean_length": 578.5535888671875, + "completions/mean_terminated_length": 578.5535888671875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 2.754191385091566, + "grad_norm": 0.5973184704780579, + "kl": 0.14990234375, + "learning_rate": 1e-06, + "loss": 0.0081, + "num_tokens": 316684805.0, + "reward": 1.5625001192092896, + "reward_std": 0.22135469317436218, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5714285969734192, + "rewards/curriculum_aware_reward_fn/std": 0.42973458766937256, + "step": 2669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3237.0, + "completions/max_terminated_length": 3237.0, + "completions/mean_length": 671.0178833007812, + "completions/mean_terminated_length": 671.0178833007812, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 2.755223110652566, + "grad_norm": 0.6458407640457153, + "kl": 0.146484375, + "learning_rate": 1e-06, + "loss": -0.0057, + "num_tokens": 316831023.0, + "reward": 1.447767972946167, + "reward_std": 0.3050350546836853, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44776788353919983, + "rewards/curriculum_aware_reward_fn/std": 0.41173049807548523, + "step": 2670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2096.0, + "completions/mean_length": 698.4285888671875, + "completions/mean_terminated_length": 636.654541015625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 2.756254836213567, + "grad_norm": 0.6006747484207153, + "kl": 0.139892578125, + "learning_rate": 1e-06, + "loss": -0.0067, + "num_tokens": 316986233.0, + "reward": 1.3883929252624512, + "reward_std": 0.22081252932548523, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3883928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.4119270443916321, + "step": 2671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1343.0, + "completions/max_terminated_length": 1343.0, + "completions/mean_length": 509.9285888671875, + "completions/mean_terminated_length": 509.9285888671875, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 2.757286561774568, + "grad_norm": 0.600599467754364, + "kl": 0.156982421875, + "learning_rate": 1e-06, + "loss": -0.006, + "num_tokens": 317104266.0, + "reward": 1.6607143878936768, + "reward_std": 0.16374391317367554, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6607142686843872, + "rewards/curriculum_aware_reward_fn/std": 0.3964611887931824, + "step": 2672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3150.0, + "completions/max_terminated_length": 3150.0, + "completions/mean_length": 566.7232666015625, + "completions/mean_terminated_length": 566.7232666015625, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 2.7583182873355687, + "grad_norm": 0.5870814919471741, + "kl": 0.158203125, + "learning_rate": 1e-06, + "loss": 0.0151, + "num_tokens": 317239895.0, + "reward": 1.4950894117355347, + "reward_std": 0.10298654437065125, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4950892925262451, + "rewards/curriculum_aware_reward_fn/std": 0.43774399161338806, + "step": 2673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3536.0, + "completions/max_terminated_length": 3536.0, + "completions/mean_length": 510.4464416503906, + "completions/mean_terminated_length": 510.4464416503906, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 2.7593500128965696, + "grad_norm": 0.5936353802680969, + "kl": 0.168701171875, + "learning_rate": 1e-06, + "loss": 0.0325, + "num_tokens": 317363628.0, + "reward": 1.5486608743667603, + "reward_std": 0.1814994066953659, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5486606955528259, + "rewards/curriculum_aware_reward_fn/std": 0.4584766924381256, + "step": 2674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1354.0, + "completions/max_terminated_length": 1354.0, + "completions/mean_length": 546.7410888671875, + "completions/mean_terminated_length": 546.7410888671875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 2.7603817384575704, + "grad_norm": 0.641977071762085, + "kl": 0.1669921875, + "learning_rate": 1e-06, + "loss": -0.0081, + "num_tokens": 317487359.0, + "reward": 1.5107144117355347, + "reward_std": 0.22570699453353882, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5107142329216003, + "rewards/curriculum_aware_reward_fn/std": 0.4437974989414215, + "step": 2675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3354.0, + "completions/max_terminated_length": 3354.0, + "completions/mean_length": 653.9464721679688, + "completions/mean_terminated_length": 653.9464721679688, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 2.7614134640185712, + "grad_norm": 0.5655720829963684, + "kl": 0.145263671875, + "learning_rate": 1e-06, + "loss": 0.0062, + "num_tokens": 317625925.0, + "reward": 1.5308035612106323, + "reward_std": 0.19269868731498718, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5308035612106323, + "rewards/curriculum_aware_reward_fn/std": 0.39990121126174927, + "step": 2676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2861.0, + "completions/mean_length": 652.357177734375, + "completions/mean_terminated_length": 621.3333740234375, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 2.762445189579572, + "grad_norm": 0.5561245679855347, + "kl": 0.1455078125, + "learning_rate": 1e-06, + "loss": 0.0495, + "num_tokens": 317764264.0, + "reward": 1.4468750953674316, + "reward_std": 0.1468437761068344, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.45580360293388367, + "rewards/curriculum_aware_reward_fn/std": 0.426368772983551, + "step": 2677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3223.0, + "completions/max_terminated_length": 3223.0, + "completions/mean_length": 674.7142944335938, + "completions/mean_terminated_length": 674.7142944335938, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 2.7634769151405725, + "grad_norm": 0.5874546766281128, + "kl": 0.14306640625, + "learning_rate": 1e-06, + "loss": -0.0474, + "num_tokens": 317916468.0, + "reward": 1.4196429252624512, + "reward_std": 0.2191556692123413, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4196428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.40953484177589417, + "step": 2678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2168.0, + "completions/max_terminated_length": 2168.0, + "completions/mean_length": 636.607177734375, + "completions/mean_terminated_length": 636.607177734375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 2.7645086407015733, + "grad_norm": 0.5092126727104187, + "kl": 0.14453125, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 318062864.0, + "reward": 1.472321629524231, + "reward_std": 0.17760160565376282, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47232145071029663, + "rewards/curriculum_aware_reward_fn/std": 0.42292627692222595, + "step": 2679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1744.0, + "completions/max_terminated_length": 1744.0, + "completions/mean_length": 626.794677734375, + "completions/mean_terminated_length": 626.794677734375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 2.765540366262574, + "grad_norm": 0.5636436939239502, + "kl": 0.13720703125, + "learning_rate": 1e-06, + "loss": 0.0336, + "num_tokens": 318204094.0, + "reward": 1.411607265472412, + "reward_std": 0.20242133736610413, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41160711646080017, + "rewards/curriculum_aware_reward_fn/std": 0.4180058538913727, + "step": 2680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2572.0, + "completions/max_terminated_length": 2572.0, + "completions/mean_length": 590.5, + "completions/mean_terminated_length": 590.5, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 2.766572091823575, + "grad_norm": 0.6176509857177734, + "kl": 0.159912109375, + "learning_rate": 1e-06, + "loss": 0.0179, + "num_tokens": 318331025.0, + "reward": 1.4200893640518188, + "reward_std": 0.17646051943302155, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4200892746448517, + "rewards/curriculum_aware_reward_fn/std": 0.4145691394805908, + "step": 2681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2171.0, + "completions/mean_length": 650.919677734375, + "completions/mean_terminated_length": 619.8828735351562, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 2.767603817384576, + "grad_norm": 0.5832086205482483, + "kl": 0.15283203125, + "learning_rate": 1e-06, + "loss": 0.0233, + "num_tokens": 318476595.0, + "reward": 1.602678656578064, + "reward_std": 0.18887139856815338, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6026785969734192, + "rewards/curriculum_aware_reward_fn/std": 0.4144267439842224, + "step": 2682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1726.0, + "completions/max_terminated_length": 1726.0, + "completions/mean_length": 543.0267944335938, + "completions/mean_terminated_length": 543.0267944335938, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 2.7686355429455762, + "grad_norm": 0.6461904644966125, + "kl": 0.159423828125, + "learning_rate": 1e-06, + "loss": -0.0094, + "num_tokens": 318607561.0, + "reward": 1.472767949104309, + "reward_std": 0.24298816919326782, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.48169639706611633, + "rewards/curriculum_aware_reward_fn/std": 0.3800187110900879, + "step": 2683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2606.0, + "completions/mean_length": 685.482177734375, + "completions/mean_terminated_length": 654.7567749023438, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 2.769667268506577, + "grad_norm": 0.5030291080474854, + "kl": 0.134765625, + "learning_rate": 1e-06, + "loss": 0.0177, + "num_tokens": 318751794.0, + "reward": 1.432142972946167, + "reward_std": 0.18000416457653046, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.44107145071029663, + "rewards/curriculum_aware_reward_fn/std": 0.419148325920105, + "step": 2684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3369.0, + "completions/max_terminated_length": 3369.0, + "completions/mean_length": 612.4375, + "completions/mean_terminated_length": 612.4375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 2.770698994067578, + "grad_norm": 0.5694253444671631, + "kl": 0.154541015625, + "learning_rate": 1e-06, + "loss": 0.0061, + "num_tokens": 318899718.0, + "reward": 1.4370537996292114, + "reward_std": 0.15659162402153015, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4370535910129547, + "rewards/curriculum_aware_reward_fn/std": 0.429893434047699, + "step": 2685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2680.0, + "completions/max_terminated_length": 2680.0, + "completions/mean_length": 691.3214721679688, + "completions/mean_terminated_length": 691.3214721679688, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 2.7717307196285788, + "grad_norm": 0.5326395034790039, + "kl": 0.139892578125, + "learning_rate": 1e-06, + "loss": -0.0154, + "num_tokens": 319038200.0, + "reward": 1.462053656578064, + "reward_std": 0.19021055102348328, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4709821343421936, + "rewards/curriculum_aware_reward_fn/std": 0.4151759743690491, + "step": 2686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2392.0, + "completions/max_terminated_length": 2392.0, + "completions/mean_length": 689.4642944335938, + "completions/mean_terminated_length": 689.4642944335938, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 2.7727624451895796, + "grad_norm": 0.5327898859977722, + "kl": 0.1422119140625, + "learning_rate": 1e-06, + "loss": 0.0102, + "num_tokens": 319175487.0, + "reward": 1.5566965341567993, + "reward_std": 0.17435990273952484, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5566964149475098, + "rewards/curriculum_aware_reward_fn/std": 0.4176023006439209, + "step": 2687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1527.0, + "completions/max_terminated_length": 1527.0, + "completions/mean_length": 625.7767944335938, + "completions/mean_terminated_length": 625.7767944335938, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 2.7737941707505804, + "grad_norm": 0.5624753832817078, + "kl": 0.151611328125, + "learning_rate": 1e-06, + "loss": 0.0162, + "num_tokens": 319319620.0, + "reward": 1.5593750476837158, + "reward_std": 0.2108374536037445, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5683035254478455, + "rewards/curriculum_aware_reward_fn/std": 0.4098997414112091, + "step": 2688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2804.0, + "completions/mean_length": 709.7767944335938, + "completions/mean_terminated_length": 648.2090454101562, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 2.7748258963115813, + "grad_norm": 0.4340974986553192, + "kl": 0.142578125, + "learning_rate": 1e-06, + "loss": 0.0671, + "num_tokens": 319458118.0, + "reward": 1.4397321939468384, + "reward_std": 0.15324297547340393, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.4575892984867096, + "rewards/curriculum_aware_reward_fn/std": 0.4162904620170593, + "step": 2689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2372.0, + "completions/max_terminated_length": 2372.0, + "completions/mean_length": 643.1339721679688, + "completions/mean_terminated_length": 643.1339721679688, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 2.775857621872582, + "grad_norm": 0.49220940470695496, + "kl": 0.14306640625, + "learning_rate": 1e-06, + "loss": 0.0193, + "num_tokens": 319593765.0, + "reward": 1.5763393640518188, + "reward_std": 0.14425250887870789, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5763392448425293, + "rewards/curriculum_aware_reward_fn/std": 0.40196365118026733, + "step": 2690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2623.0, + "completions/max_terminated_length": 2623.0, + "completions/mean_length": 591.482177734375, + "completions/mean_terminated_length": 591.482177734375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 2.7768893474335825, + "grad_norm": 0.6150352954864502, + "kl": 0.176513671875, + "learning_rate": 1e-06, + "loss": 0.0309, + "num_tokens": 319723004.0, + "reward": 1.5531251430511475, + "reward_std": 0.21724960207939148, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5620535612106323, + "rewards/curriculum_aware_reward_fn/std": 0.42214837670326233, + "step": 2691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1337.0, + "completions/max_terminated_length": 1337.0, + "completions/mean_length": 640.1160888671875, + "completions/mean_terminated_length": 640.1160888671875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 2.7779210729945834, + "grad_norm": 0.47986412048339844, + "kl": 0.14599609375, + "learning_rate": 1e-06, + "loss": 0.0389, + "num_tokens": 319856458.0, + "reward": 1.4339287281036377, + "reward_std": 0.2294609099626541, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.45178571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.46975788474082947, + "step": 2692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1712.0, + "completions/mean_length": 592.794677734375, + "completions/mean_terminated_length": 561.2342529296875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 2.778952798555584, + "grad_norm": 0.48358824849128723, + "kl": 0.15869140625, + "learning_rate": 1e-06, + "loss": -0.0188, + "num_tokens": 319982295.0, + "reward": 1.571428656578064, + "reward_std": 0.12172428518533707, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5714285969734192, + "rewards/curriculum_aware_reward_fn/std": 0.4192289412021637, + "step": 2693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3854.0, + "completions/max_terminated_length": 3854.0, + "completions/mean_length": 612.732177734375, + "completions/mean_terminated_length": 612.732177734375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 2.779984524116585, + "grad_norm": 0.5338239669799805, + "kl": 0.14404296875, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 320106718.0, + "reward": 1.6000001430511475, + "reward_std": 0.1714448630809784, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5999999642372131, + "rewards/curriculum_aware_reward_fn/std": 0.4356314539909363, + "step": 2694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1616.0, + "completions/max_terminated_length": 1616.0, + "completions/mean_length": 661.9285888671875, + "completions/mean_terminated_length": 661.9285888671875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 2.781016249677586, + "grad_norm": 0.5156787037849426, + "kl": 0.152099609375, + "learning_rate": 1e-06, + "loss": -0.02, + "num_tokens": 320257147.0, + "reward": 1.6071429252624512, + "reward_std": 0.18883086740970612, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6071428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.4365020990371704, + "step": 2695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3227.0, + "completions/mean_length": 698.857177734375, + "completions/mean_terminated_length": 668.2522583007812, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 2.7820479752385863, + "grad_norm": 0.3876507878303528, + "kl": 0.1279296875, + "learning_rate": 1e-06, + "loss": 0.017, + "num_tokens": 320398419.0, + "reward": 1.4830358028411865, + "reward_std": 0.167936772108078, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5008928179740906, + "rewards/curriculum_aware_reward_fn/std": 0.4581092298030853, + "step": 2696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2221.0, + "completions/max_terminated_length": 2221.0, + "completions/mean_length": 693.8928833007812, + "completions/mean_terminated_length": 693.8928833007812, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 2.783079700799587, + "grad_norm": 0.5896779298782349, + "kl": 0.137939453125, + "learning_rate": 1e-06, + "loss": 0.0198, + "num_tokens": 320542614.0, + "reward": 1.4705358743667603, + "reward_std": 0.2531685531139374, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4794643223285675, + "rewards/curriculum_aware_reward_fn/std": 0.4202934205532074, + "step": 2697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2356.0, + "completions/mean_length": 774.2142944335938, + "completions/mean_terminated_length": 713.8181762695312, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 2.784111426360588, + "grad_norm": 0.4785737693309784, + "kl": 0.122314453125, + "learning_rate": 1e-06, + "loss": -0.0034, + "num_tokens": 320696736.0, + "reward": 1.4147323369979858, + "reward_std": 0.21173430979251862, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4236606955528259, + "rewards/curriculum_aware_reward_fn/std": 0.43407562375068665, + "step": 2698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2713.0, + "completions/mean_length": 766.0625610351562, + "completions/mean_terminated_length": 736.0631103515625, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 2.785143151921589, + "grad_norm": 0.5446650385856628, + "kl": 0.1336669921875, + "learning_rate": 1e-06, + "loss": 0.0164, + "num_tokens": 320853571.0, + "reward": 1.4437501430511475, + "reward_std": 0.16912689805030823, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4437499940395355, + "rewards/curriculum_aware_reward_fn/std": 0.4204847514629364, + "step": 2699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1931.0, + "completions/max_terminated_length": 1931.0, + "completions/mean_length": 634.2053833007812, + "completions/mean_terminated_length": 634.2053833007812, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 2.7861748774825896, + "grad_norm": 0.5768882632255554, + "kl": 0.140380859375, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 320986972.0, + "reward": 1.5616072416305542, + "reward_std": 0.2245289385318756, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5616071820259094, + "rewards/curriculum_aware_reward_fn/std": 0.44311460852622986, + "step": 2700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1731.0, + "completions/mean_length": 669.357177734375, + "completions/mean_terminated_length": 638.4865112304688, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 2.7872066030435905, + "grad_norm": 0.5173462629318237, + "kl": 0.1390380859375, + "learning_rate": 1e-06, + "loss": -0.0049, + "num_tokens": 321138751.0, + "reward": 1.602678656578064, + "reward_std": 0.1530117690563202, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6116071343421936, + "rewards/curriculum_aware_reward_fn/std": 0.4148690402507782, + "step": 2701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2842.0, + "completions/max_terminated_length": 2842.0, + "completions/mean_length": 564.1964721679688, + "completions/mean_terminated_length": 564.1964721679688, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 2.7882383286045913, + "grad_norm": 0.5467989444732666, + "kl": 0.17529296875, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 321256849.0, + "reward": 1.5928571224212646, + "reward_std": 0.1943163275718689, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5928570628166199, + "rewards/curriculum_aware_reward_fn/std": 0.41578415036201477, + "step": 2702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2809.0, + "completions/mean_length": 691.607177734375, + "completions/mean_terminated_length": 629.7090454101562, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 2.789270054165592, + "grad_norm": 0.5662619471549988, + "kl": 0.1351318359375, + "learning_rate": 1e-06, + "loss": 0.0591, + "num_tokens": 321401355.0, + "reward": 1.3928571939468384, + "reward_std": 0.18446427583694458, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3928571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.4366568922996521, + "step": 2703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2381.0, + "completions/mean_length": 690.7857666015625, + "completions/mean_terminated_length": 660.108154296875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 2.790301779726593, + "grad_norm": 0.5307469964027405, + "kl": 0.128662109375, + "learning_rate": 1e-06, + "loss": 0.0268, + "num_tokens": 321552916.0, + "reward": 1.419196605682373, + "reward_std": 0.1603444665670395, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41919639706611633, + "rewards/curriculum_aware_reward_fn/std": 0.43605756759643555, + "step": 2704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3711.0, + "completions/mean_length": 635.1875, + "completions/mean_terminated_length": 604.009033203125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 2.7913335052875934, + "grad_norm": 0.48673635721206665, + "kl": 0.153564453125, + "learning_rate": 1e-06, + "loss": 0.0467, + "num_tokens": 321689850.0, + "reward": 1.6089287996292114, + "reward_std": 0.11687122285366058, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6178570985794067, + "rewards/curriculum_aware_reward_fn/std": 0.41551318764686584, + "step": 2705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2425.0, + "completions/mean_length": 666.9375, + "completions/mean_terminated_length": 636.0450439453125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 2.7923652308485942, + "grad_norm": 0.5421416163444519, + "kl": 0.1376953125, + "learning_rate": 1e-06, + "loss": -0.0298, + "num_tokens": 321830536.0, + "reward": 1.5468751192092896, + "reward_std": 0.17485347390174866, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.546875, + "rewards/curriculum_aware_reward_fn/std": 0.4419082999229431, + "step": 2706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2176.0, + "completions/mean_length": 717.7589721679688, + "completions/mean_terminated_length": 687.3243408203125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 2.793396956409595, + "grad_norm": 0.5420852303504944, + "kl": 0.138671875, + "learning_rate": 1e-06, + "loss": -0.0489, + "num_tokens": 321981282.0, + "reward": 1.5647321939468384, + "reward_std": 0.24127903580665588, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5647321939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3975509703159332, + "step": 2707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1874.0, + "completions/max_terminated_length": 1874.0, + "completions/mean_length": 603.0, + "completions/mean_terminated_length": 603.0, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 2.794428681970596, + "grad_norm": 0.47553688287734985, + "kl": 0.143310546875, + "learning_rate": 1e-06, + "loss": 0.0418, + "num_tokens": 322115976.0, + "reward": 1.4941965341567993, + "reward_std": 0.1817176789045334, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.49419641494750977, + "rewards/curriculum_aware_reward_fn/std": 0.46074485778808594, + "step": 2708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2807.0, + "completions/max_terminated_length": 2807.0, + "completions/mean_length": 632.1339721679688, + "completions/mean_terminated_length": 632.1339721679688, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 2.7954604075315967, + "grad_norm": 0.6050577163696289, + "kl": 0.144775390625, + "learning_rate": 1e-06, + "loss": 0.0226, + "num_tokens": 322257377.0, + "reward": 1.5830357074737549, + "reward_std": 0.21893826127052307, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5830357670783997, + "rewards/curriculum_aware_reward_fn/std": 0.41765937209129333, + "step": 2709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2743.0, + "completions/max_terminated_length": 2743.0, + "completions/mean_length": 645.8928833007812, + "completions/mean_terminated_length": 645.8928833007812, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 2.796492133092597, + "grad_norm": 0.6692031025886536, + "kl": 0.175537109375, + "learning_rate": 1e-06, + "loss": 0.0363, + "num_tokens": 322396546.0, + "reward": 1.5839285850524902, + "reward_std": 0.2636502683162689, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5839285850524902, + "rewards/curriculum_aware_reward_fn/std": 0.4007433354854584, + "step": 2710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2234.0, + "completions/mean_length": 783.8928833007812, + "completions/mean_terminated_length": 754.0540771484375, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 2.797523858653598, + "grad_norm": 0.5401923656463623, + "kl": 0.144287109375, + "learning_rate": 1e-06, + "loss": -0.0312, + "num_tokens": 322557464.0, + "reward": 1.485267996788025, + "reward_std": 0.230984166264534, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48526784777641296, + "rewards/curriculum_aware_reward_fn/std": 0.40673598647117615, + "step": 2711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2548.0, + "completions/mean_length": 667.419677734375, + "completions/mean_terminated_length": 605.081787109375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 2.798555584214599, + "grad_norm": 0.46403664350509644, + "kl": 0.1494140625, + "learning_rate": 1e-06, + "loss": 0.067, + "num_tokens": 322699170.0, + "reward": 1.650892972946167, + "reward_std": 0.172529399394989, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6598214507102966, + "rewards/curriculum_aware_reward_fn/std": 0.4270598590373993, + "step": 2712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3091.0, + "completions/mean_length": 759.669677734375, + "completions/mean_terminated_length": 729.6126098632812, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 2.7995873097755997, + "grad_norm": 0.5070902109146118, + "kl": 0.126708984375, + "learning_rate": 1e-06, + "loss": 0.0309, + "num_tokens": 322850298.0, + "reward": 1.4946428537368774, + "reward_std": 0.20876778662204742, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.49464282393455505, + "rewards/curriculum_aware_reward_fn/std": 0.4294012784957886, + "step": 2713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2500.0, + "completions/max_terminated_length": 2500.0, + "completions/mean_length": 625.5267944335938, + "completions/mean_terminated_length": 625.5267944335938, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 2.8006190353366005, + "grad_norm": 0.4674634337425232, + "kl": 0.15185546875, + "learning_rate": 1e-06, + "loss": 0.01, + "num_tokens": 322989685.0, + "reward": 1.5687501430511475, + "reward_std": 0.2000505030155182, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5687500238418579, + "rewards/curriculum_aware_reward_fn/std": 0.40576502680778503, + "step": 2714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2498.0, + "completions/max_terminated_length": 2498.0, + "completions/mean_length": 591.125, + "completions/mean_terminated_length": 591.125, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 2.8016507608976013, + "grad_norm": 0.509829580783844, + "kl": 0.14306640625, + "learning_rate": 1e-06, + "loss": -0.0437, + "num_tokens": 323110288.0, + "reward": 1.641964316368103, + "reward_std": 0.15762221813201904, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.641964316368103, + "rewards/curriculum_aware_reward_fn/std": 0.4348321259021759, + "step": 2715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2325.0, + "completions/max_terminated_length": 2325.0, + "completions/mean_length": 638.8303833007812, + "completions/mean_terminated_length": 638.8303833007812, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 2.802682486458602, + "grad_norm": 0.4947524964809418, + "kl": 0.1376953125, + "learning_rate": 1e-06, + "loss": -0.0071, + "num_tokens": 323245845.0, + "reward": 1.6642857789993286, + "reward_std": 0.23866406083106995, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6642857193946838, + "rewards/curriculum_aware_reward_fn/std": 0.42524126172065735, + "step": 2716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1779.0, + "completions/mean_length": 684.0625610351562, + "completions/mean_terminated_length": 622.0272827148438, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 2.803714212019603, + "grad_norm": 0.5975044369697571, + "kl": 0.150634765625, + "learning_rate": 1e-06, + "loss": -0.0169, + "num_tokens": 323388323.0, + "reward": 1.5200893878936768, + "reward_std": 0.1250755488872528, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5200892686843872, + "rewards/curriculum_aware_reward_fn/std": 0.41948381066322327, + "step": 2717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3763.0, + "completions/max_terminated_length": 3763.0, + "completions/mean_length": 691.9553833007812, + "completions/mean_terminated_length": 691.9553833007812, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 2.8047459375806034, + "grad_norm": 0.5362421274185181, + "kl": 0.145751953125, + "learning_rate": 1e-06, + "loss": 0.0154, + "num_tokens": 323533204.0, + "reward": 1.5040180683135986, + "reward_std": 0.16446419060230255, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5040178298950195, + "rewards/curriculum_aware_reward_fn/std": 0.5325925946235657, + "step": 2718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2365.0, + "completions/mean_length": 581.232177734375, + "completions/mean_terminated_length": 549.5675659179688, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 2.8057776631416043, + "grad_norm": 0.6170505881309509, + "kl": 0.140869140625, + "learning_rate": 1e-06, + "loss": 0.0415, + "num_tokens": 323664172.0, + "reward": 1.7236608266830444, + "reward_std": 0.10211389511823654, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.7236607670783997, + "rewards/curriculum_aware_reward_fn/std": 0.39392685890197754, + "step": 2719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1792.0, + "completions/max_terminated_length": 1792.0, + "completions/mean_length": 621.0625, + "completions/mean_terminated_length": 621.0625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 2.806809388702605, + "grad_norm": 0.5479264259338379, + "kl": 0.14306640625, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 323794162.0, + "reward": 1.52723228931427, + "reward_std": 0.2021634876728058, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5272321105003357, + "rewards/curriculum_aware_reward_fn/std": 0.4500194489955902, + "step": 2720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1790.0, + "completions/mean_length": 690.6160888671875, + "completions/mean_terminated_length": 659.9369506835938, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 2.807841114263606, + "grad_norm": 0.6118156313896179, + "kl": 0.138916015625, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 323944052.0, + "reward": 1.6593750715255737, + "reward_std": 0.1973244845867157, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.659375011920929, + "rewards/curriculum_aware_reward_fn/std": 0.3662281334400177, + "step": 2721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2712.0, + "completions/max_terminated_length": 2712.0, + "completions/mean_length": 690.0000610351562, + "completions/mean_terminated_length": 690.0000610351562, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 2.808872839824607, + "grad_norm": 0.6139779090881348, + "kl": 0.12939453125, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 324089911.0, + "reward": 1.5500000715255737, + "reward_std": 0.2188665270805359, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.550000011920929, + "rewards/curriculum_aware_reward_fn/std": 0.40509146451950073, + "step": 2722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1743.0, + "completions/max_terminated_length": 1743.0, + "completions/mean_length": 628.9107666015625, + "completions/mean_terminated_length": 628.9107666015625, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 2.809904565385607, + "grad_norm": 0.5854537487030029, + "kl": 0.143798828125, + "learning_rate": 1e-06, + "loss": 0.0433, + "num_tokens": 324235592.0, + "reward": 1.5468751192092896, + "reward_std": 0.190732941031456, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.546875, + "rewards/curriculum_aware_reward_fn/std": 0.41352590918540955, + "step": 2723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1709.0, + "completions/mean_length": 649.294677734375, + "completions/mean_terminated_length": 618.2432861328125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 2.810936290946608, + "grad_norm": 0.45837923884391785, + "kl": 0.123779296875, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 324377760.0, + "reward": 1.4794644117355347, + "reward_std": 0.1530054211616516, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4794642925262451, + "rewards/curriculum_aware_reward_fn/std": 0.5389159321784973, + "step": 2724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1929.0, + "completions/max_terminated_length": 1929.0, + "completions/mean_length": 619.857177734375, + "completions/mean_terminated_length": 619.857177734375, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 2.811968016507609, + "grad_norm": 0.617780327796936, + "kl": 0.1292724609375, + "learning_rate": 1e-06, + "loss": 0.0078, + "num_tokens": 324514489.0, + "reward": 1.368749976158142, + "reward_std": 0.15938755869865417, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36875003576278687, + "rewards/curriculum_aware_reward_fn/std": 0.39536306262016296, + "step": 2725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2127.0, + "completions/max_terminated_length": 2127.0, + "completions/mean_length": 636.9464721679688, + "completions/mean_terminated_length": 636.9464721679688, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 2.8129997420686097, + "grad_norm": 0.5487395524978638, + "kl": 0.1234130859375, + "learning_rate": 1e-06, + "loss": 0.0507, + "num_tokens": 324657129.0, + "reward": 1.4736608266830444, + "reward_std": 0.20129023492336273, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4736607074737549, + "rewards/curriculum_aware_reward_fn/std": 0.4590412974357605, + "step": 2726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1239.0, + "completions/max_terminated_length": 1239.0, + "completions/mean_length": 542.1964721679688, + "completions/mean_terminated_length": 542.1964721679688, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 2.8140314676296105, + "grad_norm": 0.5869016647338867, + "kl": 0.1339111328125, + "learning_rate": 1e-06, + "loss": 0.0216, + "num_tokens": 324778412.0, + "reward": 1.614285945892334, + "reward_std": 0.2042093425989151, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6142857670783997, + "rewards/curriculum_aware_reward_fn/std": 0.4096487760543823, + "step": 2727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3697.0, + "completions/mean_length": 686.5892944335938, + "completions/mean_terminated_length": 655.8739013671875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 2.8150631931906114, + "grad_norm": 0.5226927995681763, + "kl": 0.1175537109375, + "learning_rate": 1e-06, + "loss": 0.0266, + "num_tokens": 324929510.0, + "reward": 1.5303571224212646, + "reward_std": 0.15417543053627014, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5303571820259094, + "rewards/curriculum_aware_reward_fn/std": 0.44435179233551025, + "step": 2728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2069.0, + "completions/max_terminated_length": 2069.0, + "completions/mean_length": 589.0267944335938, + "completions/mean_terminated_length": 589.0267944335938, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 2.816094918751612, + "grad_norm": 0.7116428017616272, + "kl": 0.127197265625, + "learning_rate": 1e-06, + "loss": -0.0147, + "num_tokens": 325067392.0, + "reward": 1.4544644355773926, + "reward_std": 0.2100774347782135, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45446428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.40238887071609497, + "step": 2729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2507.0, + "completions/max_terminated_length": 2507.0, + "completions/mean_length": 517.875, + "completions/mean_terminated_length": 517.875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 2.817126644312613, + "grad_norm": 0.7094348073005676, + "kl": 0.14013671875, + "learning_rate": 1e-06, + "loss": 0.0525, + "num_tokens": 325193392.0, + "reward": 1.706696629524231, + "reward_std": 0.24004627764225006, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.7066963911056519, + "rewards/curriculum_aware_reward_fn/std": 0.35083526372909546, + "step": 2730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1381.0, + "completions/max_terminated_length": 1381.0, + "completions/mean_length": 482.6250305175781, + "completions/mean_terminated_length": 482.6250305175781, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 2.8181583698736135, + "grad_norm": 0.6465932726860046, + "kl": 0.1090087890625, + "learning_rate": 1e-06, + "loss": -0.0295, + "num_tokens": 325310677.0, + "reward": 1.5303571224212646, + "reward_std": 0.20120106637477875, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5303571820259094, + "rewards/curriculum_aware_reward_fn/std": 0.433680921792984, + "step": 2731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2721.0, + "completions/max_terminated_length": 2721.0, + "completions/mean_length": 642.7589721679688, + "completions/mean_terminated_length": 642.7589721679688, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 2.8191900954346143, + "grad_norm": 0.6267445683479309, + "kl": 0.1058349609375, + "learning_rate": 1e-06, + "loss": -0.0034, + "num_tokens": 325443650.0, + "reward": 1.4102680683135986, + "reward_std": 0.2142600417137146, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4102678596973419, + "rewards/curriculum_aware_reward_fn/std": 0.3987106680870056, + "step": 2732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1508.0, + "completions/max_terminated_length": 1508.0, + "completions/mean_length": 583.6964721679688, + "completions/mean_terminated_length": 583.6964721679688, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 2.820221820995615, + "grad_norm": 0.6053333282470703, + "kl": 0.1295166015625, + "learning_rate": 1e-06, + "loss": -0.0253, + "num_tokens": 325575566.0, + "reward": 1.549553632736206, + "reward_std": 0.23137836158275604, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5495535731315613, + "rewards/curriculum_aware_reward_fn/std": 0.401881605386734, + "step": 2733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2475.0, + "completions/max_terminated_length": 2475.0, + "completions/mean_length": 519.1875, + "completions/mean_terminated_length": 519.1875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 2.821253546556616, + "grad_norm": 0.6255543231964111, + "kl": 0.1456298828125, + "learning_rate": 1e-06, + "loss": -0.0323, + "num_tokens": 325691416.0, + "reward": 1.6580358743667603, + "reward_std": 0.20052222907543182, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6580356955528259, + "rewards/curriculum_aware_reward_fn/std": 0.4227512776851654, + "step": 2734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3016.0, + "completions/max_terminated_length": 3016.0, + "completions/mean_length": 611.3482666015625, + "completions/mean_terminated_length": 611.3482666015625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 2.822285272117617, + "grad_norm": 0.6658999919891357, + "kl": 0.1302490234375, + "learning_rate": 1e-06, + "loss": 0.0274, + "num_tokens": 325825764.0, + "reward": 1.5982145071029663, + "reward_std": 0.1547519564628601, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5982142686843872, + "rewards/curriculum_aware_reward_fn/std": 0.40503188967704773, + "step": 2735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1929.0, + "completions/max_terminated_length": 1929.0, + "completions/mean_length": 587.8660888671875, + "completions/mean_terminated_length": 587.8660888671875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 2.823316997678617, + "grad_norm": 0.6943725943565369, + "kl": 0.133544921875, + "learning_rate": 1e-06, + "loss": 0.0474, + "num_tokens": 325953247.0, + "reward": 1.4250000715255737, + "reward_std": 0.24267423152923584, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42499998211860657, + "rewards/curriculum_aware_reward_fn/std": 0.38467252254486084, + "step": 2736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3335.0, + "completions/max_terminated_length": 3335.0, + "completions/mean_length": 592.232177734375, + "completions/mean_terminated_length": 592.232177734375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 2.824348723239618, + "grad_norm": 0.6189416646957397, + "kl": 0.1094970703125, + "learning_rate": 1e-06, + "loss": -0.0577, + "num_tokens": 326078839.0, + "reward": 1.5406252145767212, + "reward_std": 0.1452028453350067, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5406249761581421, + "rewards/curriculum_aware_reward_fn/std": 0.42982980608940125, + "step": 2737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3383.0, + "completions/max_terminated_length": 3383.0, + "completions/mean_length": 761.9375610351562, + "completions/mean_terminated_length": 761.9375610351562, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 2.825380448800619, + "grad_norm": 0.5241042375564575, + "kl": 0.1085205078125, + "learning_rate": 1e-06, + "loss": 0.0348, + "num_tokens": 326234085.0, + "reward": 1.5888394117355347, + "reward_std": 0.21741487085819244, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5888392329216003, + "rewards/curriculum_aware_reward_fn/std": 0.4007149934768677, + "step": 2738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1701.0, + "completions/mean_length": 574.2767944335938, + "completions/mean_terminated_length": 542.549560546875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 2.8264121743616197, + "grad_norm": 0.6535649299621582, + "kl": 0.12548828125, + "learning_rate": 1e-06, + "loss": -0.0078, + "num_tokens": 326371708.0, + "reward": 1.5611608028411865, + "reward_std": 0.1966494917869568, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5700892806053162, + "rewards/curriculum_aware_reward_fn/std": 0.42066332697868347, + "step": 2739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1909.0, + "completions/max_terminated_length": 1909.0, + "completions/mean_length": 550.9642944335938, + "completions/mean_terminated_length": 550.9642944335938, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 2.8274438999226206, + "grad_norm": 0.599125325679779, + "kl": 0.1212158203125, + "learning_rate": 1e-06, + "loss": 0.0339, + "num_tokens": 326502156.0, + "reward": 1.4745535850524902, + "reward_std": 0.17932043969631195, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47455355525016785, + "rewards/curriculum_aware_reward_fn/std": 0.436251163482666, + "step": 2740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2128.0, + "completions/mean_length": 633.0267944335938, + "completions/mean_terminated_length": 601.828857421875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 2.8284756254836214, + "grad_norm": 0.6590193510055542, + "kl": 0.14404296875, + "learning_rate": 1e-06, + "loss": 0.0427, + "num_tokens": 326646660.0, + "reward": 1.5750000476837158, + "reward_std": 0.12754014134407043, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5839285254478455, + "rewards/curriculum_aware_reward_fn/std": 0.4407821595668793, + "step": 2741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2685.0, + "completions/max_terminated_length": 2685.0, + "completions/mean_length": 577.6607666015625, + "completions/mean_terminated_length": 577.6607666015625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 2.8295073510446223, + "grad_norm": 0.5379606485366821, + "kl": 0.118896484375, + "learning_rate": 1e-06, + "loss": -0.0226, + "num_tokens": 326771250.0, + "reward": 1.4455357789993286, + "reward_std": 0.13722234964370728, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44553571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.44935157895088196, + "step": 2742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1714.0, + "completions/max_terminated_length": 1714.0, + "completions/mean_length": 644.419677734375, + "completions/mean_terminated_length": 644.419677734375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 2.830539076605623, + "grad_norm": 0.5652866959571838, + "kl": 0.1173095703125, + "learning_rate": 1e-06, + "loss": -0.0139, + "num_tokens": 326917751.0, + "reward": 1.4107143878936768, + "reward_std": 0.21303237974643707, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4107142984867096, + "rewards/curriculum_aware_reward_fn/std": 0.3831724226474762, + "step": 2743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2476.0, + "completions/max_terminated_length": 2476.0, + "completions/mean_length": 647.4642944335938, + "completions/mean_terminated_length": 647.4642944335938, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 2.831570802166624, + "grad_norm": 0.5718545317649841, + "kl": 0.1138916015625, + "learning_rate": 1e-06, + "loss": 0.0321, + "num_tokens": 327065461.0, + "reward": 1.4526787996292114, + "reward_std": 0.25439465045928955, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45267853140830994, + "rewards/curriculum_aware_reward_fn/std": 0.43443238735198975, + "step": 2744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1646.0, + "completions/mean_length": 669.2410888671875, + "completions/mean_terminated_length": 606.9363403320312, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 2.8326025277276243, + "grad_norm": 0.6078090667724609, + "kl": 0.1171875, + "learning_rate": 1e-06, + "loss": 0.0352, + "num_tokens": 327200042.0, + "reward": 1.380357265472412, + "reward_std": 0.20888622105121613, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.38928571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.3939536511898041, + "step": 2745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1619.0, + "completions/max_terminated_length": 1619.0, + "completions/mean_length": 590.4285888671875, + "completions/mean_terminated_length": 590.4285888671875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 2.833634253288625, + "grad_norm": 0.5378431081771851, + "kl": 0.120361328125, + "learning_rate": 1e-06, + "loss": -0.0286, + "num_tokens": 327332831.0, + "reward": 1.627232313156128, + "reward_std": 0.18446555733680725, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6272321343421936, + "rewards/curriculum_aware_reward_fn/std": 0.38237571716308594, + "step": 2746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2641.0, + "completions/max_terminated_length": 2641.0, + "completions/mean_length": 603.3482666015625, + "completions/mean_terminated_length": 603.3482666015625, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 2.834665978849626, + "grad_norm": 0.6392841935157776, + "kl": 0.1219482421875, + "learning_rate": 1e-06, + "loss": -0.0078, + "num_tokens": 327466857.0, + "reward": 1.5977680683135986, + "reward_std": 0.24501563608646393, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5977678298950195, + "rewards/curriculum_aware_reward_fn/std": 0.39044836163520813, + "step": 2747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2207.0, + "completions/max_terminated_length": 2207.0, + "completions/mean_length": 645.875, + "completions/mean_terminated_length": 645.875, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 2.835697704410627, + "grad_norm": 0.533803403377533, + "kl": 0.125244140625, + "learning_rate": 1e-06, + "loss": -0.0215, + "num_tokens": 327609580.0, + "reward": 1.5861607789993286, + "reward_std": 0.13563905656337738, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5861607193946838, + "rewards/curriculum_aware_reward_fn/std": 0.4360557198524475, + "step": 2748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2596.0, + "completions/max_terminated_length": 2596.0, + "completions/mean_length": 657.2589721679688, + "completions/mean_terminated_length": 657.2589721679688, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 2.8367294299716277, + "grad_norm": 0.5073429942131042, + "kl": 0.1175537109375, + "learning_rate": 1e-06, + "loss": 0.0134, + "num_tokens": 327750904.0, + "reward": 1.4535715579986572, + "reward_std": 0.13991807401180267, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4535714089870453, + "rewards/curriculum_aware_reward_fn/std": 0.44024530053138733, + "step": 2749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1730.0, + "completions/max_terminated_length": 1730.0, + "completions/mean_length": 606.3482666015625, + "completions/mean_terminated_length": 606.3482666015625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 2.837761155532628, + "grad_norm": 0.6113404035568237, + "kl": 0.11865234375, + "learning_rate": 1e-06, + "loss": 0.0395, + "num_tokens": 327879911.0, + "reward": 1.4343750476837158, + "reward_std": 0.21782106161117554, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43437501788139343, + "rewards/curriculum_aware_reward_fn/std": 0.41220298409461975, + "step": 2750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1860.0, + "completions/max_terminated_length": 1860.0, + "completions/mean_length": 582.669677734375, + "completions/mean_terminated_length": 582.669677734375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 2.838792881093629, + "grad_norm": 0.6139469146728516, + "kl": 0.1182861328125, + "learning_rate": 1e-06, + "loss": 0.0107, + "num_tokens": 328012209.0, + "reward": 1.520535945892334, + "reward_std": 0.25029444694519043, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5205357670783997, + "rewards/curriculum_aware_reward_fn/std": 0.4264642596244812, + "step": 2751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2621.0, + "completions/max_terminated_length": 2621.0, + "completions/mean_length": 575.5803833007812, + "completions/mean_terminated_length": 575.5803833007812, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 2.8398246066546298, + "grad_norm": 0.6015924215316772, + "kl": 0.1124267578125, + "learning_rate": 1e-06, + "loss": 0.0346, + "num_tokens": 328144476.0, + "reward": 1.5433037281036377, + "reward_std": 0.198805034160614, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5433035492897034, + "rewards/curriculum_aware_reward_fn/std": 0.406728059053421, + "step": 2752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2968.0, + "completions/mean_length": 690.5089721679688, + "completions/mean_terminated_length": 659.828857421875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 2.8408563322156306, + "grad_norm": 0.6319563984870911, + "kl": 0.1114501953125, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 328289587.0, + "reward": 1.383928656578064, + "reward_std": 0.24402377009391785, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.4017857015132904, + "rewards/curriculum_aware_reward_fn/std": 0.4391293525695801, + "step": 2753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1251.0, + "completions/max_terminated_length": 1251.0, + "completions/mean_length": 524.125, + "completions/mean_terminated_length": 524.125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 2.8418880577766314, + "grad_norm": 0.5966294407844543, + "kl": 0.1282958984375, + "learning_rate": 1e-06, + "loss": 0.0474, + "num_tokens": 328418450.0, + "reward": 1.462053656578064, + "reward_std": 0.16561885178089142, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4709821343421936, + "rewards/curriculum_aware_reward_fn/std": 0.4223293662071228, + "step": 2754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2748.0, + "completions/max_terminated_length": 2748.0, + "completions/mean_length": 646.3482666015625, + "completions/mean_terminated_length": 646.3482666015625, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 2.8429197833376323, + "grad_norm": 0.5699669122695923, + "kl": 0.1124267578125, + "learning_rate": 1e-06, + "loss": -0.0129, + "num_tokens": 328553904.0, + "reward": 1.5258928537368774, + "reward_std": 0.23065383732318878, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5348213911056519, + "rewards/curriculum_aware_reward_fn/std": 0.43219736218452454, + "step": 2755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1848.0, + "completions/max_terminated_length": 1848.0, + "completions/mean_length": 570.794677734375, + "completions/mean_terminated_length": 570.794677734375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 2.843951508898633, + "grad_norm": 0.5182858109474182, + "kl": 0.1142578125, + "learning_rate": 1e-06, + "loss": 0.0274, + "num_tokens": 328687413.0, + "reward": 1.6473214626312256, + "reward_std": 0.20871160924434662, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6473214030265808, + "rewards/curriculum_aware_reward_fn/std": 0.5464397072792053, + "step": 2756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1819.0, + "completions/max_terminated_length": 1819.0, + "completions/mean_length": 706.3839721679688, + "completions/mean_terminated_length": 706.3839721679688, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 2.844983234459634, + "grad_norm": 0.5897096991539001, + "kl": 0.1068115234375, + "learning_rate": 1e-06, + "loss": 0.0127, + "num_tokens": 328831369.0, + "reward": 1.379910945892334, + "reward_std": 0.20769324898719788, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3799107074737549, + "rewards/curriculum_aware_reward_fn/std": 0.3808158338069916, + "step": 2757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3134.0, + "completions/mean_length": 649.4553833007812, + "completions/mean_terminated_length": 618.4053955078125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 2.8460149600206344, + "grad_norm": 0.6046122312545776, + "kl": 0.1082763671875, + "learning_rate": 1e-06, + "loss": 0.0676, + "num_tokens": 328976761.0, + "reward": 1.5232144594192505, + "reward_std": 0.255319744348526, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5410714745521545, + "rewards/curriculum_aware_reward_fn/std": 0.44656920433044434, + "step": 2758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1459.0, + "completions/mean_length": 615.5089721679688, + "completions/mean_terminated_length": 552.2272338867188, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 2.847046685581635, + "grad_norm": 0.6533037424087524, + "kl": 0.11572265625, + "learning_rate": 1e-06, + "loss": -0.0589, + "num_tokens": 329112086.0, + "reward": 1.6267858743667603, + "reward_std": 0.24057099223136902, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6267856955528259, + "rewards/curriculum_aware_reward_fn/std": 0.3920912444591522, + "step": 2759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2279.0, + "completions/mean_length": 679.419677734375, + "completions/mean_terminated_length": 648.6396484375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 2.848078411142636, + "grad_norm": 0.6277405619621277, + "kl": 0.1229248046875, + "learning_rate": 1e-06, + "loss": 0.0315, + "num_tokens": 329255815.0, + "reward": 1.4151787757873535, + "reward_std": 0.23310363292694092, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4151785671710968, + "rewards/curriculum_aware_reward_fn/std": 0.4091762602329254, + "step": 2760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3006.0, + "completions/max_terminated_length": 3006.0, + "completions/mean_length": 806.8750610351562, + "completions/mean_terminated_length": 806.8750610351562, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 2.849110136703637, + "grad_norm": 0.5804691910743713, + "kl": 0.1204833984375, + "learning_rate": 1e-06, + "loss": -0.008, + "num_tokens": 329422254.0, + "reward": 1.368303656578064, + "reward_std": 0.15276534855365753, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3683035671710968, + "rewards/curriculum_aware_reward_fn/std": 0.3609294593334198, + "step": 2761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2329.0, + "completions/max_terminated_length": 2329.0, + "completions/mean_length": 783.7857666015625, + "completions/mean_terminated_length": 783.7857666015625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 2.8501418622646377, + "grad_norm": 0.5310266017913818, + "kl": 0.1234130859375, + "learning_rate": 1e-06, + "loss": 0.0319, + "num_tokens": 329580689.0, + "reward": 1.231696605682373, + "reward_std": 0.16519694030284882, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.24955357611179352, + "rewards/curriculum_aware_reward_fn/std": 0.34896859526634216, + "step": 2762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3746.0, + "completions/max_terminated_length": 3746.0, + "completions/mean_length": 704.6964721679688, + "completions/mean_terminated_length": 704.6964721679688, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 2.851173587825638, + "grad_norm": 0.5648817420005798, + "kl": 0.1199951171875, + "learning_rate": 1e-06, + "loss": -0.0221, + "num_tokens": 329719689.0, + "reward": 1.5000001192092896, + "reward_std": 0.23215486109256744, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5, + "rewards/curriculum_aware_reward_fn/std": 0.43137115240097046, + "step": 2763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3582.0, + "completions/max_terminated_length": 3582.0, + "completions/mean_length": 713.9107666015625, + "completions/mean_terminated_length": 713.9107666015625, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 2.852205313386639, + "grad_norm": 0.5491836667060852, + "kl": 0.117919921875, + "learning_rate": 1e-06, + "loss": -0.0367, + "num_tokens": 329872860.0, + "reward": 1.566517949104309, + "reward_std": 0.15733234584331512, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5665178894996643, + "rewards/curriculum_aware_reward_fn/std": 0.40516865253448486, + "step": 2764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2924.0, + "completions/max_terminated_length": 2924.0, + "completions/mean_length": 811.982177734375, + "completions/mean_terminated_length": 811.982177734375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 2.85323703894764, + "grad_norm": 0.5808110237121582, + "kl": 0.1220703125, + "learning_rate": 1e-06, + "loss": 0.0553, + "num_tokens": 330039416.0, + "reward": 1.4968751668930054, + "reward_std": 0.24768134951591492, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5058035850524902, + "rewards/curriculum_aware_reward_fn/std": 0.42789793014526367, + "step": 2765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3141.0, + "completions/max_terminated_length": 3141.0, + "completions/mean_length": 748.7232666015625, + "completions/mean_terminated_length": 748.7232666015625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 2.8542687645086406, + "grad_norm": 0.4694342017173767, + "kl": 0.118408203125, + "learning_rate": 1e-06, + "loss": -0.0046, + "num_tokens": 330197711.0, + "reward": 1.4142857789993286, + "reward_std": 0.12062786519527435, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.41428571939468384, + "rewards/curriculum_aware_reward_fn/std": 0.5386180281639099, + "step": 2766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2854.0, + "completions/max_terminated_length": 2854.0, + "completions/mean_length": 840.2053833007812, + "completions/mean_terminated_length": 840.2053833007812, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 2.8553004900696415, + "grad_norm": 0.5480297803878784, + "kl": 0.1085205078125, + "learning_rate": 1e-06, + "loss": 0.0201, + "num_tokens": 330355913.0, + "reward": 1.4700894355773926, + "reward_std": 0.1603080928325653, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.47901788353919983, + "rewards/curriculum_aware_reward_fn/std": 0.4383958876132965, + "step": 2767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3313.0, + "completions/max_terminated_length": 3313.0, + "completions/mean_length": 571.232177734375, + "completions/mean_terminated_length": 571.232177734375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 2.8563322156306423, + "grad_norm": 0.49730056524276733, + "kl": 0.1295166015625, + "learning_rate": 1e-06, + "loss": -0.0083, + "num_tokens": 330470935.0, + "reward": 1.669196605682373, + "reward_std": 0.12415627390146255, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6691964268684387, + "rewards/curriculum_aware_reward_fn/std": 0.41395941376686096, + "step": 2768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3258.0, + "completions/max_terminated_length": 3258.0, + "completions/mean_length": 697.1250610351562, + "completions/mean_terminated_length": 697.1250610351562, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 2.857363941191643, + "grad_norm": 0.5766168236732483, + "kl": 0.1219482421875, + "learning_rate": 1e-06, + "loss": 0.0916, + "num_tokens": 330613336.0, + "reward": 1.6540179252624512, + "reward_std": 0.22875763475894928, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6540178656578064, + "rewards/curriculum_aware_reward_fn/std": 0.4130276143550873, + "step": 2769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2671.0, + "completions/mean_length": 701.9464721679688, + "completions/mean_terminated_length": 671.369384765625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 2.858395666752644, + "grad_norm": 0.5088793039321899, + "kl": 0.1124267578125, + "learning_rate": 1e-06, + "loss": 0.0451, + "num_tokens": 330753596.0, + "reward": 1.5861607789993286, + "reward_std": 0.17370003461837769, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5861607193946838, + "rewards/curriculum_aware_reward_fn/std": 0.5460014343261719, + "step": 2770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2800.0, + "completions/max_terminated_length": 2800.0, + "completions/mean_length": 639.9285888671875, + "completions/mean_terminated_length": 639.9285888671875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 2.859427392313645, + "grad_norm": 0.6735429763793945, + "kl": 0.1207275390625, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 330890606.0, + "reward": 1.4941965341567993, + "reward_std": 0.1921509951353073, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.49419641494750977, + "rewards/curriculum_aware_reward_fn/std": 0.4499121606349945, + "step": 2771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4082.0, + "completions/max_terminated_length": 4082.0, + "completions/mean_length": 805.8928833007812, + "completions/mean_terminated_length": 805.8928833007812, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 2.8604591178746452, + "grad_norm": 0.5096302628517151, + "kl": 0.111328125, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 331049949.0, + "reward": 1.5459821224212646, + "reward_std": 0.17727798223495483, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5549107193946838, + "rewards/curriculum_aware_reward_fn/std": 0.42964261770248413, + "step": 2772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3661.0, + "completions/max_terminated_length": 3661.0, + "completions/mean_length": 619.107177734375, + "completions/mean_terminated_length": 619.107177734375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 2.861490843435646, + "grad_norm": 0.5546408891677856, + "kl": 0.125, + "learning_rate": 1e-06, + "loss": 0.0497, + "num_tokens": 331189877.0, + "reward": 1.5205358266830444, + "reward_std": 0.09648904204368591, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5205357670783997, + "rewards/curriculum_aware_reward_fn/std": 0.4294640123844147, + "step": 2773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3947.0, + "completions/mean_length": 982.4732666015625, + "completions/mean_terminated_length": 954.4234619140625, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 2.862522568996647, + "grad_norm": 0.5640942454338074, + "kl": 0.1187744140625, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 331377719.0, + "reward": 1.2647321224212646, + "reward_std": 0.21712327003479004, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.27366071939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3806679844856262, + "step": 2774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1928.0, + "completions/max_terminated_length": 1928.0, + "completions/mean_length": 547.6428833007812, + "completions/mean_terminated_length": 547.6428833007812, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 2.8635542945576478, + "grad_norm": 0.5652421116828918, + "kl": 0.122802734375, + "learning_rate": 1e-06, + "loss": -0.0065, + "num_tokens": 331505114.0, + "reward": 1.849107265472412, + "reward_std": 0.19259557127952576, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.858035683631897, + "rewards/curriculum_aware_reward_fn/std": 0.3003416657447815, + "step": 2775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3592.0, + "completions/mean_length": 880.3839721679688, + "completions/mean_terminated_length": 851.4144287109375, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 2.8645860201186486, + "grad_norm": 0.5577312111854553, + "kl": 0.1126708984375, + "learning_rate": 1e-06, + "loss": -0.0028, + "num_tokens": 331668782.0, + "reward": 1.4937502145767212, + "reward_std": 0.21671874821186066, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4937500059604645, + "rewards/curriculum_aware_reward_fn/std": 0.4100167453289032, + "step": 2776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2131.0, + "completions/max_terminated_length": 2131.0, + "completions/mean_length": 712.4285888671875, + "completions/mean_terminated_length": 712.4285888671875, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 2.865617745679649, + "grad_norm": 0.49200648069381714, + "kl": 0.12451171875, + "learning_rate": 1e-06, + "loss": -0.0173, + "num_tokens": 331810006.0, + "reward": 1.5785716772079468, + "reward_std": 0.16207048296928406, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5785713791847229, + "rewards/curriculum_aware_reward_fn/std": 0.4149087369441986, + "step": 2777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2793.0, + "completions/max_terminated_length": 2793.0, + "completions/mean_length": 710.1517944335938, + "completions/mean_terminated_length": 710.1517944335938, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 2.86664947124065, + "grad_norm": 0.5689948797225952, + "kl": 0.1351318359375, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 331957618.0, + "reward": 1.630357265472412, + "reward_std": 0.22884601354599, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6303571462631226, + "rewards/curriculum_aware_reward_fn/std": 0.40131300687789917, + "step": 2778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2850.0, + "completions/mean_length": 929.6160888671875, + "completions/mean_terminated_length": 872.04541015625, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 2.8676811968016507, + "grad_norm": 0.4698331356048584, + "kl": 0.1124267578125, + "learning_rate": 1e-06, + "loss": -0.0131, + "num_tokens": 332132715.0, + "reward": 1.5125000476837158, + "reward_std": 0.19689759612083435, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.512499988079071, + "rewards/curriculum_aware_reward_fn/std": 0.39045479893684387, + "step": 2779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2132.0, + "completions/max_terminated_length": 2132.0, + "completions/mean_length": 624.1607666015625, + "completions/mean_terminated_length": 624.1607666015625, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 2.8687129223626515, + "grad_norm": 0.560449481010437, + "kl": 0.1348876953125, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 332272866.0, + "reward": 1.7741073369979858, + "reward_std": 0.14867447316646576, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.7741070985794067, + "rewards/curriculum_aware_reward_fn/std": 0.36304405331611633, + "step": 2780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2469.0, + "completions/max_terminated_length": 2469.0, + "completions/mean_length": 788.3482666015625, + "completions/mean_terminated_length": 788.3482666015625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 2.8697446479236524, + "grad_norm": 0.6889148354530334, + "kl": 0.1185302734375, + "learning_rate": 1e-06, + "loss": 0.028, + "num_tokens": 332425309.0, + "reward": 1.471428632736206, + "reward_std": 0.25298944115638733, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4714285731315613, + "rewards/curriculum_aware_reward_fn/std": 0.4599605202674866, + "step": 2781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3227.0, + "completions/mean_length": 819.2767944335938, + "completions/mean_terminated_length": 789.7567749023438, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 2.870776373484653, + "grad_norm": 0.5370697975158691, + "kl": 0.124267578125, + "learning_rate": 1e-06, + "loss": 0.0406, + "num_tokens": 332581454.0, + "reward": 1.4379466772079468, + "reward_std": 0.24798668920993805, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4379464089870453, + "rewards/curriculum_aware_reward_fn/std": 0.43724387884140015, + "step": 2782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3573.0, + "completions/max_terminated_length": 3573.0, + "completions/mean_length": 672.4375, + "completions/mean_terminated_length": 672.4375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 2.871808099045654, + "grad_norm": 0.5681089758872986, + "kl": 0.1175537109375, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 332718437.0, + "reward": 1.7406251430511475, + "reward_std": 0.12895211577415466, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.7406249642372131, + "rewards/curriculum_aware_reward_fn/std": 0.37141871452331543, + "step": 2783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3656.0, + "completions/max_terminated_length": 3656.0, + "completions/mean_length": 722.4732666015625, + "completions/mean_terminated_length": 722.4732666015625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 2.872839824606655, + "grad_norm": 0.4188723862171173, + "kl": 0.1199951171875, + "learning_rate": 1e-06, + "loss": 0.055, + "num_tokens": 332865182.0, + "reward": 1.532589316368103, + "reward_std": 0.09648028761148453, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.532589316368103, + "rewards/curriculum_aware_reward_fn/std": 0.45031070709228516, + "step": 2784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2457.0, + "completions/max_terminated_length": 2457.0, + "completions/mean_length": 710.6339721679688, + "completions/mean_terminated_length": 710.6339721679688, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 2.8738715501676553, + "grad_norm": 0.6341184973716736, + "kl": 0.1148681640625, + "learning_rate": 1e-06, + "loss": 0.034, + "num_tokens": 333005939.0, + "reward": 1.5611608028411865, + "reward_std": 0.2271462231874466, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5611607432365417, + "rewards/curriculum_aware_reward_fn/std": 0.4104880392551422, + "step": 2785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4080.0, + "completions/max_terminated_length": 4080.0, + "completions/mean_length": 785.3482666015625, + "completions/mean_terminated_length": 785.3482666015625, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 2.874903275728656, + "grad_norm": 0.5458928942680359, + "kl": 0.1356201171875, + "learning_rate": 1e-06, + "loss": -0.0053, + "num_tokens": 333163331.0, + "reward": 1.3892858028411865, + "reward_std": 0.22566410899162292, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.39821428060531616, + "rewards/curriculum_aware_reward_fn/std": 0.43308696150779724, + "step": 2786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2467.0, + "completions/max_terminated_length": 2467.0, + "completions/mean_length": 808.5535888671875, + "completions/mean_terminated_length": 808.5535888671875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 2.875935001289657, + "grad_norm": 0.6074646711349487, + "kl": 0.12451171875, + "learning_rate": 1e-06, + "loss": -0.0106, + "num_tokens": 333327725.0, + "reward": 1.5156251192092896, + "reward_std": 0.21479451656341553, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5245535969734192, + "rewards/curriculum_aware_reward_fn/std": 0.41708964109420776, + "step": 2787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2659.0, + "completions/mean_length": 827.1964721679688, + "completions/mean_terminated_length": 767.7636108398438, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 2.876966726850658, + "grad_norm": 0.5130965113639832, + "kl": 0.1142578125, + "learning_rate": 1e-06, + "loss": 0.0503, + "num_tokens": 333495621.0, + "reward": 1.5142858028411865, + "reward_std": 0.22739137709140778, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5142857432365417, + "rewards/curriculum_aware_reward_fn/std": 0.43472206592559814, + "step": 2788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2858.0, + "completions/mean_length": 929.4732666015625, + "completions/mean_terminated_length": 900.9459838867188, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 2.8779984524116586, + "grad_norm": 0.5246358513832092, + "kl": 0.1146240234375, + "learning_rate": 1e-06, + "loss": -0.0464, + "num_tokens": 333678829.0, + "reward": 1.4263393878936768, + "reward_std": 0.26658251881599426, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4352678656578064, + "rewards/curriculum_aware_reward_fn/std": 0.4434019923210144, + "step": 2789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1959.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 650.1428833007812, + "completions/mean_terminated_length": 650.1428833007812, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 2.879030177972659, + "grad_norm": 0.6150388121604919, + "kl": 0.1239013671875, + "learning_rate": 1e-06, + "loss": -0.0082, + "num_tokens": 333819846.0, + "reward": 1.581696629524231, + "reward_std": 0.2450869381427765, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5816964507102966, + "rewards/curriculum_aware_reward_fn/std": 0.42833197116851807, + "step": 2790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2465.0, + "completions/mean_length": 782.1964721679688, + "completions/mean_terminated_length": 752.3423461914062, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 2.88006190353366, + "grad_norm": 0.47666504979133606, + "kl": 0.1256103515625, + "learning_rate": 1e-06, + "loss": -0.003, + "num_tokens": 333977641.0, + "reward": 1.5531251430511475, + "reward_std": 0.1824059635400772, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5531250238418579, + "rewards/curriculum_aware_reward_fn/std": 0.3910762071609497, + "step": 2791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3040.0, + "completions/max_terminated_length": 3040.0, + "completions/mean_length": 796.982177734375, + "completions/mean_terminated_length": 796.982177734375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 2.8810936290946607, + "grad_norm": 0.5374991297721863, + "kl": 0.1209716796875, + "learning_rate": 1e-06, + "loss": 0.0343, + "num_tokens": 334127880.0, + "reward": 1.5241073369979858, + "reward_std": 0.22164270281791687, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5241070985794067, + "rewards/curriculum_aware_reward_fn/std": 0.4269091784954071, + "step": 2792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3287.0, + "completions/max_terminated_length": 3287.0, + "completions/mean_length": 786.6250610351562, + "completions/mean_terminated_length": 786.6250610351562, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 2.8821253546556616, + "grad_norm": 0.5066479444503784, + "kl": 0.1220703125, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 334284099.0, + "reward": 1.5000001192092896, + "reward_std": 0.1792917400598526, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5, + "rewards/curriculum_aware_reward_fn/std": 0.4358898997306824, + "step": 2793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2175.0, + "completions/max_terminated_length": 2175.0, + "completions/mean_length": 705.2232666015625, + "completions/mean_terminated_length": 705.2232666015625, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 2.8831570802166624, + "grad_norm": 0.6270851492881775, + "kl": 0.120361328125, + "learning_rate": 1e-06, + "loss": 0.0432, + "num_tokens": 334421051.0, + "reward": 1.5513395071029663, + "reward_std": 0.21944689750671387, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5513392686843872, + "rewards/curriculum_aware_reward_fn/std": 0.3947550058364868, + "step": 2794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3129.0, + "completions/max_terminated_length": 3129.0, + "completions/mean_length": 823.3392944335938, + "completions/mean_terminated_length": 823.3392944335938, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 2.8841888057776632, + "grad_norm": 0.5626925826072693, + "kl": 0.1043701171875, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 334585281.0, + "reward": 1.4611607789993286, + "reward_std": 0.11960483342409134, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.47008928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.4516465961933136, + "step": 2795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2989.0, + "completions/max_terminated_length": 2989.0, + "completions/mean_length": 820.4464721679688, + "completions/mean_terminated_length": 820.4464721679688, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 2.885220531338664, + "grad_norm": 0.5923090577125549, + "kl": 0.1287841796875, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 334744364.0, + "reward": 1.5718750953674316, + "reward_std": 0.3103684186935425, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5718750357627869, + "rewards/curriculum_aware_reward_fn/std": 0.4846648871898651, + "step": 2796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3144.0, + "completions/mean_length": 881.0714721679688, + "completions/mean_terminated_length": 852.108154296875, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 2.886252256899665, + "grad_norm": 0.53386390209198, + "kl": 0.120849609375, + "learning_rate": 1e-06, + "loss": -0.0667, + "num_tokens": 334909243.0, + "reward": 1.4450894594192505, + "reward_std": 0.16090065240859985, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.44508931040763855, + "rewards/curriculum_aware_reward_fn/std": 0.4154103398323059, + "step": 2797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3976.0, + "completions/max_terminated_length": 3976.0, + "completions/mean_length": 712.8750610351562, + "completions/mean_terminated_length": 712.8750610351562, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 2.8872839824606653, + "grad_norm": 0.6113415956497192, + "kl": 0.1234130859375, + "learning_rate": 1e-06, + "loss": 0.0284, + "num_tokens": 335048200.0, + "reward": 1.5406252145767212, + "reward_std": 0.16090570390224457, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5495535731315613, + "rewards/curriculum_aware_reward_fn/std": 0.4137009382247925, + "step": 2798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3375.0, + "completions/mean_length": 783.0267944335938, + "completions/mean_terminated_length": 753.18017578125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 2.888315708021666, + "grad_norm": 0.5956709384918213, + "kl": 0.1158447265625, + "learning_rate": 1e-06, + "loss": 0.0327, + "num_tokens": 335197856.0, + "reward": 1.5160715579986572, + "reward_std": 0.17236091196537018, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5249999761581421, + "rewards/curriculum_aware_reward_fn/std": 0.4052026569843292, + "step": 2799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1861.0, + "completions/max_terminated_length": 1861.0, + "completions/mean_length": 654.1964721679688, + "completions/mean_terminated_length": 654.1964721679688, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 2.889347433582667, + "grad_norm": 0.6350661516189575, + "kl": 0.131103515625, + "learning_rate": 1e-06, + "loss": -0.0495, + "num_tokens": 335340475.0, + "reward": 1.6669644117355347, + "reward_std": 0.2553524374961853, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6669642329216003, + "rewards/curriculum_aware_reward_fn/std": 0.5253721475601196, + "step": 2800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2500.0, + "completions/mean_length": 702.3928833007812, + "completions/mean_terminated_length": 671.81982421875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 2.890379159143668, + "grad_norm": 0.49716106057167053, + "kl": 0.1234130859375, + "learning_rate": 1e-06, + "loss": 0.0097, + "num_tokens": 335487652.0, + "reward": 1.5357143878936768, + "reward_std": 0.1621207445859909, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5357142686843872, + "rewards/curriculum_aware_reward_fn/std": 0.4404353201389313, + "step": 2801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2283.0, + "completions/max_terminated_length": 2283.0, + "completions/mean_length": 720.2767944335938, + "completions/mean_terminated_length": 720.2767944335938, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 2.8914108847046687, + "grad_norm": 0.5649811625480652, + "kl": 0.1072998046875, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 335639044.0, + "reward": 1.4316965341567993, + "reward_std": 0.16086453199386597, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43169641494750977, + "rewards/curriculum_aware_reward_fn/std": 0.45564502477645874, + "step": 2802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2778.0, + "completions/max_terminated_length": 2778.0, + "completions/mean_length": 695.732177734375, + "completions/mean_terminated_length": 695.732177734375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 2.892442610265669, + "grad_norm": 0.5937812328338623, + "kl": 0.13037109375, + "learning_rate": 1e-06, + "loss": -0.0075, + "num_tokens": 335782052.0, + "reward": 1.5696429014205933, + "reward_std": 0.26064127683639526, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5785714387893677, + "rewards/curriculum_aware_reward_fn/std": 0.4103785455226898, + "step": 2803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1304.0, + "completions/max_terminated_length": 1304.0, + "completions/mean_length": 562.3125, + "completions/mean_terminated_length": 562.3125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 2.89347433582667, + "grad_norm": 0.6014729142189026, + "kl": 0.142822265625, + "learning_rate": 1e-06, + "loss": -0.0243, + "num_tokens": 335903824.0, + "reward": 1.4691965579986572, + "reward_std": 0.17668238282203674, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46919646859169006, + "rewards/curriculum_aware_reward_fn/std": 0.4110109210014343, + "step": 2804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1903.0, + "completions/mean_length": 748.5803833007812, + "completions/mean_terminated_length": 718.4234619140625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 2.8945060613876707, + "grad_norm": 0.5668159127235413, + "kl": 0.12548828125, + "learning_rate": 1e-06, + "loss": -0.0038, + "num_tokens": 336058855.0, + "reward": 1.516964316368103, + "reward_std": 0.16923758387565613, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5169642567634583, + "rewards/curriculum_aware_reward_fn/std": 0.3730536997318268, + "step": 2805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2043.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 551.2589721679688, + "completions/mean_terminated_length": 551.2589721679688, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 2.8955377869486716, + "grad_norm": 0.603298544883728, + "kl": 0.13525390625, + "learning_rate": 1e-06, + "loss": 0.0425, + "num_tokens": 336180509.0, + "reward": 1.6491073369979858, + "reward_std": 0.14685435593128204, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6491071581840515, + "rewards/curriculum_aware_reward_fn/std": 0.40730834007263184, + "step": 2806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2774.0, + "completions/mean_length": 750.1785888671875, + "completions/mean_terminated_length": 720.0360717773438, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 2.8965695125096724, + "grad_norm": 0.6138197779655457, + "kl": 0.1397705078125, + "learning_rate": 1e-06, + "loss": 0.0226, + "num_tokens": 336322209.0, + "reward": 1.5781251192092896, + "reward_std": 0.23737137019634247, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.578125, + "rewards/curriculum_aware_reward_fn/std": 0.40829136967658997, + "step": 2807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3321.0, + "completions/max_terminated_length": 3321.0, + "completions/mean_length": 701.794677734375, + "completions/mean_terminated_length": 701.794677734375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 2.8976012380706733, + "grad_norm": 0.4589069187641144, + "kl": 0.11572265625, + "learning_rate": 1e-06, + "loss": 0.0257, + "num_tokens": 336471041.0, + "reward": 1.4861607551574707, + "reward_std": 0.17394407093524933, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4950892925262451, + "rewards/curriculum_aware_reward_fn/std": 0.4386177957057953, + "step": 2808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2001.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 665.9017944335938, + "completions/mean_terminated_length": 665.9017944335938, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 2.898632963631674, + "grad_norm": 0.5997697710990906, + "kl": 0.128173828125, + "learning_rate": 1e-06, + "loss": 0.012, + "num_tokens": 336614026.0, + "reward": 1.4651787281036377, + "reward_std": 0.25731945037841797, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46517857909202576, + "rewards/curriculum_aware_reward_fn/std": 0.41928935050964355, + "step": 2809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1818.0, + "completions/max_terminated_length": 1818.0, + "completions/mean_length": 557.6160888671875, + "completions/mean_terminated_length": 557.6160888671875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 2.899664689192675, + "grad_norm": 0.6498278975486755, + "kl": 0.144287109375, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 336741677.0, + "reward": 1.740625023841858, + "reward_std": 0.16024711728096008, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.7406249642372131, + "rewards/curriculum_aware_reward_fn/std": 0.34838956594467163, + "step": 2810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3370.0, + "completions/max_terminated_length": 3370.0, + "completions/mean_length": 698.1964721679688, + "completions/mean_terminated_length": 698.1964721679688, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 2.900696414753676, + "grad_norm": 0.5694387555122375, + "kl": 0.121337890625, + "learning_rate": 1e-06, + "loss": 0.0339, + "num_tokens": 336887621.0, + "reward": 1.4205358028411865, + "reward_std": 0.21582452952861786, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42053571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.425618439912796, + "step": 2811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2319.0, + "completions/max_terminated_length": 2319.0, + "completions/mean_length": 653.6785888671875, + "completions/mean_terminated_length": 653.6785888671875, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 2.901728140314676, + "grad_norm": 0.6535736918449402, + "kl": 0.1265869140625, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 337025618.0, + "reward": 1.5375001430511475, + "reward_std": 0.19925980269908905, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5374999642372131, + "rewards/curriculum_aware_reward_fn/std": 0.4090474545955658, + "step": 2812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2772.0, + "completions/max_terminated_length": 2772.0, + "completions/mean_length": 771.8303833007812, + "completions/mean_terminated_length": 771.8303833007812, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 2.902759865875677, + "grad_norm": 0.44244030117988586, + "kl": 0.116455078125, + "learning_rate": 1e-06, + "loss": -0.042, + "num_tokens": 337180082.0, + "reward": 1.5151785612106323, + "reward_std": 0.20895953476428986, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5241071581840515, + "rewards/curriculum_aware_reward_fn/std": 0.4499240219593048, + "step": 2813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3413.0, + "completions/max_terminated_length": 3413.0, + "completions/mean_length": 795.6160888671875, + "completions/mean_terminated_length": 795.6160888671875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 2.903791591436678, + "grad_norm": 0.4640490412712097, + "kl": 0.125244140625, + "learning_rate": 1e-06, + "loss": -0.0219, + "num_tokens": 337326080.0, + "reward": 1.4522321224212646, + "reward_std": 0.13497215509414673, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45223212242126465, + "rewards/curriculum_aware_reward_fn/std": 0.462191641330719, + "step": 2814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2401.0, + "completions/max_terminated_length": 2401.0, + "completions/mean_length": 745.0357666015625, + "completions/mean_terminated_length": 745.0357666015625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 2.9048233169976787, + "grad_norm": 0.5465977191925049, + "kl": 0.117431640625, + "learning_rate": 1e-06, + "loss": 0.0153, + "num_tokens": 337480854.0, + "reward": 1.5299108028411865, + "reward_std": 0.1310456395149231, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.529910683631897, + "rewards/curriculum_aware_reward_fn/std": 0.4423249363899231, + "step": 2815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3062.0, + "completions/max_terminated_length": 3062.0, + "completions/mean_length": 760.732177734375, + "completions/mean_terminated_length": 760.732177734375, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 2.9058550425586795, + "grad_norm": 0.6337264776229858, + "kl": 0.1339111328125, + "learning_rate": 1e-06, + "loss": -0.0207, + "num_tokens": 337628125.0, + "reward": 1.5000001192092896, + "reward_std": 0.24028640985488892, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5, + "rewards/curriculum_aware_reward_fn/std": 0.37835583090782166, + "step": 2816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3695.0, + "completions/mean_length": 868.7767944335938, + "completions/mean_terminated_length": 839.7026977539062, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 2.90688676811968, + "grad_norm": 0.49252942204475403, + "kl": 0.114013671875, + "learning_rate": 1e-06, + "loss": 0.0217, + "num_tokens": 337793615.0, + "reward": 1.4214287996292114, + "reward_std": 0.22298011183738708, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4214285910129547, + "rewards/curriculum_aware_reward_fn/std": 0.37414854764938354, + "step": 2817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2959.0, + "completions/mean_length": 753.9375610351562, + "completions/mean_terminated_length": 723.828857421875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 2.907918493680681, + "grad_norm": 0.49966421723365784, + "kl": 0.1248779296875, + "learning_rate": 1e-06, + "loss": -0.0143, + "num_tokens": 337950723.0, + "reward": 1.646875023841858, + "reward_std": 0.17419442534446716, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6558035612106323, + "rewards/curriculum_aware_reward_fn/std": 0.3948283791542053, + "step": 2818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3673.0, + "completions/max_terminated_length": 3673.0, + "completions/mean_length": 786.0982666015625, + "completions/mean_terminated_length": 786.0982666015625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 2.9089502192416816, + "grad_norm": 0.5664613842964172, + "kl": 0.1182861328125, + "learning_rate": 1e-06, + "loss": 0.0722, + "num_tokens": 338106337.0, + "reward": 1.5683037042617798, + "reward_std": 0.25919803977012634, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5772320628166199, + "rewards/curriculum_aware_reward_fn/std": 0.40502965450286865, + "step": 2819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3216.0, + "completions/mean_length": 950.0714721679688, + "completions/mean_terminated_length": 892.8726806640625, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 2.9099819448026825, + "grad_norm": 0.46091020107269287, + "kl": 0.1280517578125, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 338281478.0, + "reward": 1.6665180921554565, + "reward_std": 0.20088189840316772, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6665177941322327, + "rewards/curriculum_aware_reward_fn/std": 0.4047793447971344, + "step": 2820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3970.0, + "completions/mean_length": 981.2500610351562, + "completions/mean_terminated_length": 895.5228881835938, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 2.9110136703636833, + "grad_norm": 0.4261971116065979, + "kl": 0.109619140625, + "learning_rate": 1e-06, + "loss": 0.0441, + "num_tokens": 338466243.0, + "reward": 1.5625001192092896, + "reward_std": 0.12906794250011444, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5625, + "rewards/curriculum_aware_reward_fn/std": 0.41058626770973206, + "step": 2821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2439.0, + "completions/max_terminated_length": 2439.0, + "completions/mean_length": 724.9107666015625, + "completions/mean_terminated_length": 724.9107666015625, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 2.912045395924684, + "grad_norm": 0.5682430267333984, + "kl": 0.1474609375, + "learning_rate": 1e-06, + "loss": 0.0154, + "num_tokens": 338605568.0, + "reward": 1.591071605682373, + "reward_std": 0.16823892295360565, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5910714268684387, + "rewards/curriculum_aware_reward_fn/std": 0.3809787333011627, + "step": 2822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3659.0, + "completions/max_terminated_length": 3659.0, + "completions/mean_length": 943.0357666015625, + "completions/mean_terminated_length": 943.0357666015625, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 2.913077121485685, + "grad_norm": 0.5692936182022095, + "kl": 0.123046875, + "learning_rate": 1e-06, + "loss": -0.0234, + "num_tokens": 338785388.0, + "reward": 1.5294643640518188, + "reward_std": 0.2762223482131958, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5294643044471741, + "rewards/curriculum_aware_reward_fn/std": 0.41206759214401245, + "step": 2823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3892.0, + "completions/mean_length": 930.8125610351562, + "completions/mean_terminated_length": 873.2636108398438, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 2.914108847046686, + "grad_norm": 0.6045404076576233, + "kl": 0.130859375, + "learning_rate": 1e-06, + "loss": 0.066, + "num_tokens": 338955327.0, + "reward": 1.4459823369979858, + "reward_std": 0.21156080067157745, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4459821581840515, + "rewards/curriculum_aware_reward_fn/std": 0.3662654459476471, + "step": 2824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3253.0, + "completions/mean_length": 999.607177734375, + "completions/mean_terminated_length": 914.38525390625, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 2.915140572607686, + "grad_norm": 0.3792083263397217, + "kl": 0.1165771484375, + "learning_rate": 1e-06, + "loss": -0.0088, + "num_tokens": 339142323.0, + "reward": 1.6156251430511475, + "reward_std": 0.11980683356523514, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6156249642372131, + "rewards/curriculum_aware_reward_fn/std": 0.5581110119819641, + "step": 2825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3649.0, + "completions/mean_length": 964.4107666015625, + "completions/mean_terminated_length": 878.2201538085938, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 2.916172298168687, + "grad_norm": 0.5529889464378357, + "kl": 0.1221923828125, + "learning_rate": 1e-06, + "loss": -0.0369, + "num_tokens": 339312434.0, + "reward": 1.3674108982086182, + "reward_std": 0.2013842761516571, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.36741071939468384, + "rewards/curriculum_aware_reward_fn/std": 0.3771460950374603, + "step": 2826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3098.0, + "completions/mean_length": 1062.6785888671875, + "completions/mean_terminated_length": 1035.351318359375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 2.917204023729688, + "grad_norm": 0.5321182012557983, + "kl": 0.1229248046875, + "learning_rate": 1e-06, + "loss": 0.0096, + "num_tokens": 339513739.0, + "reward": 1.4044643640518188, + "reward_std": 0.2600024342536926, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4044642746448517, + "rewards/curriculum_aware_reward_fn/std": 0.416038453578949, + "step": 2827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3805.0, + "completions/mean_length": 987.0535888671875, + "completions/mean_terminated_length": 959.0450439453125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 2.9182357492906887, + "grad_norm": 0.5274437665939331, + "kl": 0.1236572265625, + "learning_rate": 1e-06, + "loss": -0.0428, + "num_tokens": 339698146.0, + "reward": 1.4517858028411865, + "reward_std": 0.21504361927509308, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.45178571343421936, + "rewards/curriculum_aware_reward_fn/std": 0.5527135729789734, + "step": 2828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3789.0, + "completions/mean_length": 947.3125610351562, + "completions/mean_terminated_length": 860.6513671875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 2.9192674748516896, + "grad_norm": 0.5115098357200623, + "kl": 0.1151123046875, + "learning_rate": 1e-06, + "loss": -0.0245, + "num_tokens": 339882113.0, + "reward": 1.4446429014205933, + "reward_std": 0.18274597823619843, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4446428716182709, + "rewards/curriculum_aware_reward_fn/std": 0.4277195334434509, + "step": 2829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2271.0, + "completions/mean_length": 755.4107666015625, + "completions/mean_terminated_length": 725.3153076171875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 2.92029920041269, + "grad_norm": 0.487190306186676, + "kl": 0.1287841796875, + "learning_rate": 1e-06, + "loss": 0.0334, + "num_tokens": 340031308.0, + "reward": 1.6031250953674316, + "reward_std": 0.17092326283454895, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6031250357627869, + "rewards/curriculum_aware_reward_fn/std": 0.44262126088142395, + "step": 2830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2653.0, + "completions/mean_length": 825.1160888671875, + "completions/mean_terminated_length": 795.648681640625, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 2.921330925973691, + "grad_norm": 0.6423907279968262, + "kl": 0.129638671875, + "learning_rate": 1e-06, + "loss": 0.0496, + "num_tokens": 340195310.0, + "reward": 1.6241074800491333, + "reward_std": 0.18456201255321503, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6330357193946838, + "rewards/curriculum_aware_reward_fn/std": 0.3969792127609253, + "step": 2831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3153.0, + "completions/mean_length": 932.15185546875, + "completions/mean_terminated_length": 903.648681640625, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 2.9223626515346917, + "grad_norm": 0.5027857422828674, + "kl": 0.126953125, + "learning_rate": 1e-06, + "loss": -0.0407, + "num_tokens": 340367212.0, + "reward": 1.581696629524231, + "reward_std": 0.16816502809524536, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5816964507102966, + "rewards/curriculum_aware_reward_fn/std": 0.45227134227752686, + "step": 2832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2578.0, + "completions/max_terminated_length": 2578.0, + "completions/mean_length": 864.6428833007812, + "completions/mean_terminated_length": 864.6428833007812, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 2.9233943770956925, + "grad_norm": 0.5807149410247803, + "kl": 0.13818359375, + "learning_rate": 1e-06, + "loss": -0.0229, + "num_tokens": 340532652.0, + "reward": 1.5982143878936768, + "reward_std": 0.21378812193870544, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5982142686843872, + "rewards/curriculum_aware_reward_fn/std": 0.39931970834732056, + "step": 2833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3418.0, + "completions/mean_length": 1074.1160888671875, + "completions/mean_terminated_length": 990.9448852539062, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 2.9244261026566933, + "grad_norm": 0.5498697757720947, + "kl": 0.1300048828125, + "learning_rate": 1e-06, + "loss": -0.0208, + "num_tokens": 340714651.0, + "reward": 1.7075893878936768, + "reward_std": 0.2345176488161087, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.7165178656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3282075524330139, + "step": 2834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3567.0, + "completions/mean_length": 1098.1875, + "completions/mean_terminated_length": 1043.6817626953125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 2.925457828217694, + "grad_norm": 0.5744073987007141, + "kl": 0.12158203125, + "learning_rate": 1e-06, + "loss": -0.0426, + "num_tokens": 340906136.0, + "reward": 1.4392857551574707, + "reward_std": 0.18629738688468933, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4392856955528259, + "rewards/curriculum_aware_reward_fn/std": 0.37945127487182617, + "step": 2835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2680.0, + "completions/max_terminated_length": 2680.0, + "completions/mean_length": 968.0982666015625, + "completions/mean_terminated_length": 968.0982666015625, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 2.926489553778695, + "grad_norm": 0.4878877103328705, + "kl": 0.124755859375, + "learning_rate": 1e-06, + "loss": -0.0356, + "num_tokens": 341083996.0, + "reward": 1.4049108028411865, + "reward_std": 0.24925042688846588, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.41383928060531616, + "rewards/curriculum_aware_reward_fn/std": 0.41844895482063293, + "step": 2836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0357142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3865.0, + "completions/mean_length": 1233.8035888671875, + "completions/mean_terminated_length": 1127.7962646484375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 2.927521279339696, + "grad_norm": 0.4677523672580719, + "kl": 0.123046875, + "learning_rate": 1e-06, + "loss": 0.0649, + "num_tokens": 341291300.0, + "reward": 1.5093750953674316, + "reward_std": 0.24394011497497559, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5093749761581421, + "rewards/curriculum_aware_reward_fn/std": 0.428202360868454, + "step": 2837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3826.0, + "completions/mean_length": 1228.982177734375, + "completions/mean_terminated_length": 1203.1531982421875, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 2.9285530049006963, + "grad_norm": 0.41032853722572327, + "kl": 0.107666015625, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 341499247.0, + "reward": 1.471428632736206, + "reward_std": 0.20130853354930878, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47142860293388367, + "rewards/curriculum_aware_reward_fn/std": 0.42994415760040283, + "step": 2838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3315.0, + "completions/max_terminated_length": 3315.0, + "completions/mean_length": 947.2232666015625, + "completions/mean_terminated_length": 947.2232666015625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 2.929584730461697, + "grad_norm": 0.5380738973617554, + "kl": 0.12353515625, + "learning_rate": 1e-06, + "loss": 0.0548, + "num_tokens": 341665018.0, + "reward": 1.5549108982086182, + "reward_std": 0.22859680652618408, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.563839316368103, + "rewards/curriculum_aware_reward_fn/std": 0.4066568613052368, + "step": 2839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3756.0, + "completions/mean_length": 1066.46435546875, + "completions/mean_terminated_length": 983.08251953125, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 2.930616456022698, + "grad_norm": 0.4303039610385895, + "kl": 0.1212158203125, + "learning_rate": 1e-06, + "loss": -0.0139, + "num_tokens": 341859537.0, + "reward": 1.599107265472412, + "reward_std": 0.14463859796524048, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5991071462631226, + "rewards/curriculum_aware_reward_fn/std": 0.4139994978904724, + "step": 2840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3358.0, + "completions/max_terminated_length": 3358.0, + "completions/mean_length": 786.9910888671875, + "completions/mean_terminated_length": 786.9910888671875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 2.9316481815836988, + "grad_norm": 0.5593550205230713, + "kl": 0.13818359375, + "learning_rate": 1e-06, + "loss": -0.0497, + "num_tokens": 342008076.0, + "reward": 1.612053632736206, + "reward_std": 0.1519756317138672, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6120535731315613, + "rewards/curriculum_aware_reward_fn/std": 0.4075440466403961, + "step": 2841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3221.0, + "completions/mean_length": 1076.33935546875, + "completions/mean_terminated_length": 1021.4363403320312, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 2.9326799071446996, + "grad_norm": 0.4824768006801605, + "kl": 0.129638671875, + "learning_rate": 1e-06, + "loss": -0.0626, + "num_tokens": 342193264.0, + "reward": 1.5526785850524902, + "reward_std": 0.25366339087486267, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5705357193946838, + "rewards/curriculum_aware_reward_fn/std": 0.4081684947013855, + "step": 2842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3469.0, + "completions/mean_length": 1164.2679443359375, + "completions/mean_terminated_length": 1110.963623046875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 2.9337116327057, + "grad_norm": 0.5000720024108887, + "kl": 0.1190185546875, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 342397911.0, + "reward": 1.372321605682373, + "reward_std": 0.24542978405952454, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.3812499940395355, + "rewards/curriculum_aware_reward_fn/std": 0.41509386897087097, + "step": 2843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3398.0, + "completions/mean_length": 802.9285888671875, + "completions/mean_terminated_length": 773.2612915039062, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 2.934743358266701, + "grad_norm": 0.4824245572090149, + "kl": 0.140869140625, + "learning_rate": 1e-06, + "loss": 0.0265, + "num_tokens": 342545232.0, + "reward": 1.7607142925262451, + "reward_std": 0.14560459554195404, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.7607142329216003, + "rewards/curriculum_aware_reward_fn/std": 0.37712928652763367, + "step": 2844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3877.0, + "completions/mean_length": 1002.3660888671875, + "completions/mean_terminated_length": 946.1181640625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 2.9357750838277017, + "grad_norm": 0.4928884208202362, + "kl": 0.124755859375, + "learning_rate": 1e-06, + "loss": 0.0697, + "num_tokens": 342713812.0, + "reward": 1.5191963911056519, + "reward_std": 0.16029249131679535, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5191964507102966, + "rewards/curriculum_aware_reward_fn/std": 0.44485270977020264, + "step": 2845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2634.0, + "completions/mean_length": 991.1250610351562, + "completions/mean_terminated_length": 905.669677734375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 2.9368068093887025, + "grad_norm": 0.5415922999382019, + "kl": 0.121337890625, + "learning_rate": 1e-06, + "loss": -0.0152, + "num_tokens": 342893194.0, + "reward": 1.5223214626312256, + "reward_std": 0.1560944765806198, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5223214030265808, + "rewards/curriculum_aware_reward_fn/std": 0.3887980580329895, + "step": 2846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3737.0, + "completions/mean_length": 981.9910888671875, + "completions/mean_terminated_length": 953.9369506835938, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 2.9378385349497034, + "grad_norm": 0.5075610280036926, + "kl": 0.116943359375, + "learning_rate": 1e-06, + "loss": 0.0055, + "num_tokens": 343080191.0, + "reward": 1.4678572416305542, + "reward_std": 0.307505339384079, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46785715222358704, + "rewards/curriculum_aware_reward_fn/std": 0.4610435664653778, + "step": 2847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4084.0, + "completions/mean_length": 972.3035888671875, + "completions/mean_terminated_length": 944.1621704101562, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 2.938870260510704, + "grad_norm": 0.4480708837509155, + "kl": 0.1298828125, + "learning_rate": 1e-06, + "loss": 0.029, + "num_tokens": 343258900.0, + "reward": 1.5433037281036377, + "reward_std": 0.1520136296749115, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.561160683631897, + "rewards/curriculum_aware_reward_fn/std": 0.36383283138275146, + "step": 2848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2760.0, + "completions/max_terminated_length": 2760.0, + "completions/mean_length": 842.857177734375, + "completions/mean_terminated_length": 842.857177734375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 2.939901986071705, + "grad_norm": 0.5809365510940552, + "kl": 0.143798828125, + "learning_rate": 1e-06, + "loss": 0.0887, + "num_tokens": 343417287.0, + "reward": 1.5294644832611084, + "reward_std": 0.19331319630146027, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5294643044471741, + "rewards/curriculum_aware_reward_fn/std": 0.42243146896362305, + "step": 2849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2616.0, + "completions/max_terminated_length": 2616.0, + "completions/mean_length": 855.7767944335938, + "completions/mean_terminated_length": 855.7767944335938, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 2.940933711632706, + "grad_norm": 0.5113884806632996, + "kl": 0.12109375, + "learning_rate": 1e-06, + "loss": -0.0412, + "num_tokens": 343576869.0, + "reward": 1.41785728931427, + "reward_std": 0.19872523844242096, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.4357142746448517, + "rewards/curriculum_aware_reward_fn/std": 0.44094642996788025, + "step": 2850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2965.0, + "completions/max_terminated_length": 2965.0, + "completions/mean_length": 716.7500610351562, + "completions/mean_terminated_length": 716.7500610351562, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 2.9419654371937067, + "grad_norm": 0.5477640628814697, + "kl": 0.140869140625, + "learning_rate": 1e-06, + "loss": -0.017, + "num_tokens": 343720810.0, + "reward": 1.7714285850524902, + "reward_std": 0.2532138526439667, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.7714285254478455, + "rewards/curriculum_aware_reward_fn/std": 0.49190354347229004, + "step": 2851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3282.0, + "completions/mean_length": 754.9553833007812, + "completions/mean_terminated_length": 724.8558959960938, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 2.942997162754707, + "grad_norm": 0.5419442057609558, + "kl": 0.13427734375, + "learning_rate": 1e-06, + "loss": 0.0707, + "num_tokens": 343859496.0, + "reward": 1.6598217487335205, + "reward_std": 0.18061259388923645, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.668749988079071, + "rewards/curriculum_aware_reward_fn/std": 0.5330147743225098, + "step": 2852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2810.0, + "completions/max_terminated_length": 2810.0, + "completions/mean_length": 825.6339721679688, + "completions/mean_terminated_length": 825.6339721679688, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 2.944028888315708, + "grad_norm": 0.5500017404556274, + "kl": 0.1317138671875, + "learning_rate": 1e-06, + "loss": 0.0183, + "num_tokens": 344020070.0, + "reward": 1.6044644117355347, + "reward_std": 0.27769601345062256, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6133928298950195, + "rewards/curriculum_aware_reward_fn/std": 0.41924333572387695, + "step": 2853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3986.0, + "completions/mean_length": 1099.1429443359375, + "completions/mean_terminated_length": 1016.6605224609375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 2.945060613876709, + "grad_norm": 0.46461525559425354, + "kl": 0.1148681640625, + "learning_rate": 1e-06, + "loss": 0.0834, + "num_tokens": 344206762.0, + "reward": 1.4647324085235596, + "reward_std": 0.21833495795726776, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4736606776714325, + "rewards/curriculum_aware_reward_fn/std": 0.42542821168899536, + "step": 2854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3170.0, + "completions/mean_length": 833.1785888671875, + "completions/mean_terminated_length": 803.7838134765625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 2.9460923394377097, + "grad_norm": 0.4217899739742279, + "kl": 0.126708984375, + "learning_rate": 1e-06, + "loss": 0.0102, + "num_tokens": 344364660.0, + "reward": 1.5799108743667603, + "reward_std": 0.15548360347747803, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5799107551574707, + "rewards/curriculum_aware_reward_fn/std": 0.4215191602706909, + "step": 2855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3976.0, + "completions/mean_length": 958.90185546875, + "completions/mean_terminated_length": 901.8635864257812, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 2.9471240649987105, + "grad_norm": 0.4388851821422577, + "kl": 0.1221923828125, + "learning_rate": 1e-06, + "loss": -0.0618, + "num_tokens": 344538795.0, + "reward": 1.4928573369979858, + "reward_std": 0.12822991609573364, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5017856955528259, + "rewards/curriculum_aware_reward_fn/std": 0.4532131850719452, + "step": 2856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2801.0, + "completions/mean_length": 834.2053833007812, + "completions/mean_terminated_length": 774.8999633789062, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 2.948155790559711, + "grad_norm": 0.5034121870994568, + "kl": 0.1298828125, + "learning_rate": 1e-06, + "loss": 0.0575, + "num_tokens": 344688492.0, + "reward": 1.7308037281036377, + "reward_std": 0.2394391894340515, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.7397321462631226, + "rewards/curriculum_aware_reward_fn/std": 0.3914133608341217, + "step": 2857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3984.0, + "completions/mean_length": 1040.169677734375, + "completions/mean_terminated_length": 984.6090698242188, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 2.9491875161207117, + "grad_norm": 0.42861661314964294, + "kl": 0.1171875, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 344872023.0, + "reward": 1.3441965579986572, + "reward_std": 0.1831224113702774, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3441964089870453, + "rewards/curriculum_aware_reward_fn/std": 0.43468666076660156, + "step": 2858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3575.0, + "completions/max_terminated_length": 3575.0, + "completions/mean_length": 857.857177734375, + "completions/mean_terminated_length": 857.857177734375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 2.9502192416817126, + "grad_norm": 0.4969906806945801, + "kl": 0.1148681640625, + "learning_rate": 1e-06, + "loss": 0.0681, + "num_tokens": 345027057.0, + "reward": 1.5781251192092896, + "reward_std": 0.16588565707206726, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5870535969734192, + "rewards/curriculum_aware_reward_fn/std": 0.40374070405960083, + "step": 2859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3894.0, + "completions/mean_length": 780.8392944335938, + "completions/mean_terminated_length": 750.9729614257812, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 2.9512509672427134, + "grad_norm": 0.4635484516620636, + "kl": 0.12744140625, + "learning_rate": 1e-06, + "loss": 0.0418, + "num_tokens": 345183251.0, + "reward": 1.5647321939468384, + "reward_std": 0.07009764015674591, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5647321343421936, + "rewards/curriculum_aware_reward_fn/std": 0.46443989872932434, + "step": 2860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1736.0, + "completions/mean_length": 652.5982666015625, + "completions/mean_terminated_length": 589.9909057617188, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 2.9522826928037142, + "grad_norm": 0.6120730638504028, + "kl": 0.1446533203125, + "learning_rate": 1e-06, + "loss": 0.0208, + "num_tokens": 345320483.0, + "reward": 1.5982145071029663, + "reward_std": 0.1766142100095749, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.6160714030265808, + "rewards/curriculum_aware_reward_fn/std": 0.43827131390571594, + "step": 2861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3693.0, + "completions/mean_length": 883.9910888671875, + "completions/mean_terminated_length": 825.5908813476562, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 2.953314418364715, + "grad_norm": 0.5264288783073425, + "kl": 0.107177734375, + "learning_rate": 1e-06, + "loss": -0.0233, + "num_tokens": 345486739.0, + "reward": 1.5410715341567993, + "reward_std": 0.19420649111270905, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5410714149475098, + "rewards/curriculum_aware_reward_fn/std": 0.45198333263397217, + "step": 2862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3149.0, + "completions/max_terminated_length": 3149.0, + "completions/mean_length": 692.7142944335938, + "completions/mean_terminated_length": 692.7142944335938, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 2.954346143925716, + "grad_norm": 0.6059958338737488, + "kl": 0.13427734375, + "learning_rate": 1e-06, + "loss": -0.007, + "num_tokens": 345622780.0, + "reward": 1.7245537042617798, + "reward_std": 0.24420930445194244, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.7334821820259094, + "rewards/curriculum_aware_reward_fn/std": 0.381579726934433, + "step": 2863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3316.0, + "completions/mean_length": 1037.625, + "completions/mean_terminated_length": 1010.0720825195312, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 2.9553778694867168, + "grad_norm": 0.5002031922340393, + "kl": 0.1168212890625, + "learning_rate": 1e-06, + "loss": 0.0154, + "num_tokens": 345811500.0, + "reward": 1.4736608266830444, + "reward_std": 0.22914379835128784, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47366073727607727, + "rewards/curriculum_aware_reward_fn/std": 0.39881354570388794, + "step": 2864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1879.0, + "completions/max_terminated_length": 1879.0, + "completions/mean_length": 682.794677734375, + "completions/mean_terminated_length": 682.794677734375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 2.956409595047717, + "grad_norm": 0.5402216911315918, + "kl": 0.12255859375, + "learning_rate": 1e-06, + "loss": -0.0035, + "num_tokens": 345950497.0, + "reward": 1.5294643640518188, + "reward_std": 0.1578807830810547, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5383928418159485, + "rewards/curriculum_aware_reward_fn/std": 0.4456993043422699, + "step": 2865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3317.0, + "completions/max_terminated_length": 3317.0, + "completions/mean_length": 645.2589721679688, + "completions/mean_terminated_length": 645.2589721679688, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 2.957441320608718, + "grad_norm": 0.7363216280937195, + "kl": 0.1314697265625, + "learning_rate": 1e-06, + "loss": -0.0286, + "num_tokens": 346073054.0, + "reward": 1.5375001430511475, + "reward_std": 0.22698816657066345, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5375000238418579, + "rewards/curriculum_aware_reward_fn/std": 0.3998591899871826, + "step": 2866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3290.0, + "completions/mean_length": 780.4464721679688, + "completions/mean_terminated_length": 750.5765991210938, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 2.958473046169719, + "grad_norm": 0.5626248717308044, + "kl": 0.122314453125, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 346223968.0, + "reward": 1.4299107789993286, + "reward_std": 0.22761982679367065, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.42991071939468384, + "rewards/curriculum_aware_reward_fn/std": 0.4294591248035431, + "step": 2867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3555.0, + "completions/max_terminated_length": 3555.0, + "completions/mean_length": 774.9285888671875, + "completions/mean_terminated_length": 774.9285888671875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 2.9595047717307197, + "grad_norm": 0.633262038230896, + "kl": 0.13623046875, + "learning_rate": 1e-06, + "loss": -0.0027, + "num_tokens": 346375412.0, + "reward": 1.4750001430511475, + "reward_std": 0.26338452100753784, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.48392853140830994, + "rewards/curriculum_aware_reward_fn/std": 0.3957666754722595, + "step": 2868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3717.0, + "completions/max_terminated_length": 3717.0, + "completions/mean_length": 777.6428833007812, + "completions/mean_terminated_length": 777.6428833007812, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 2.9605364972917205, + "grad_norm": 0.5764303207397461, + "kl": 0.12646484375, + "learning_rate": 1e-06, + "loss": 0.0124, + "num_tokens": 346521896.0, + "reward": 1.5575894117355347, + "reward_std": 0.19927051663398743, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5575893521308899, + "rewards/curriculum_aware_reward_fn/std": 0.38783350586891174, + "step": 2869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2798.0, + "completions/mean_length": 924.8035888671875, + "completions/mean_terminated_length": 896.2342529296875, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 2.961568222852721, + "grad_norm": 0.5695756077766418, + "kl": 0.1204833984375, + "learning_rate": 1e-06, + "loss": -0.0135, + "num_tokens": 346698218.0, + "reward": 1.4745537042617798, + "reward_std": 0.205791175365448, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.47455358505249023, + "rewards/curriculum_aware_reward_fn/std": 0.4199954569339752, + "step": 2870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3622.0, + "completions/max_terminated_length": 3622.0, + "completions/mean_length": 910.5625610351562, + "completions/mean_terminated_length": 910.5625610351562, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 2.9625999484137218, + "grad_norm": 0.5911626815795898, + "kl": 0.130615234375, + "learning_rate": 1e-06, + "loss": 0.035, + "num_tokens": 346872168.0, + "reward": 1.4625000953674316, + "reward_std": 0.24348336458206177, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.47142860293388367, + "rewards/curriculum_aware_reward_fn/std": 0.36028730869293213, + "step": 2871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3325.0, + "completions/mean_length": 850.607177734375, + "completions/mean_terminated_length": 821.369384765625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 2.9636316739747226, + "grad_norm": 0.5764513611793518, + "kl": 0.12646484375, + "learning_rate": 1e-06, + "loss": 0.0217, + "num_tokens": 347033201.0, + "reward": 1.5437501668930054, + "reward_std": 0.26699310541152954, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5616071820259094, + "rewards/curriculum_aware_reward_fn/std": 0.44716235995292664, + "step": 2872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3913.0, + "completions/mean_length": 1122.884033203125, + "completions/mean_terminated_length": 1041.0550537109375, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 2.9646633995357234, + "grad_norm": 0.6206802129745483, + "kl": 0.130615234375, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 347237056.0, + "reward": 1.6232144832611084, + "reward_std": 0.21191005408763885, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.6321429014205933, + "rewards/curriculum_aware_reward_fn/std": 0.5180028080940247, + "step": 2873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4092.0, + "completions/max_terminated_length": 4092.0, + "completions/mean_length": 929.3303833007812, + "completions/mean_terminated_length": 929.3303833007812, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 2.9656951250967243, + "grad_norm": 0.500167965888977, + "kl": 0.1180419921875, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 347408621.0, + "reward": 1.5196430683135986, + "reward_std": 0.18252387642860413, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5196428894996643, + "rewards/curriculum_aware_reward_fn/std": 0.4163525402545929, + "step": 2874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3737.0, + "completions/mean_length": 942.8214721679688, + "completions/mean_terminated_length": 885.4909057617188, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 2.966726850657725, + "grad_norm": 0.5842325687408447, + "kl": 0.1158447265625, + "learning_rate": 1e-06, + "loss": -0.0194, + "num_tokens": 347585309.0, + "reward": 1.4964287281036377, + "reward_std": 0.24344106018543243, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.49642854928970337, + "rewards/curriculum_aware_reward_fn/std": 0.3867579400539398, + "step": 2875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3725.0, + "completions/mean_length": 990.0714721679688, + "completions/mean_terminated_length": 904.5870971679688, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 2.967758576218726, + "grad_norm": 0.4529733657836914, + "kl": 0.1328125, + "learning_rate": 1e-06, + "loss": 0.0215, + "num_tokens": 347772077.0, + "reward": 1.6580358743667603, + "reward_std": 0.14680266380310059, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6580356955528259, + "rewards/curriculum_aware_reward_fn/std": 0.530958354473114, + "step": 2876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2847.0, + "completions/mean_length": 947.6964721679688, + "completions/mean_terminated_length": 919.3333740234375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 2.968790301779727, + "grad_norm": 0.5367709994316101, + "kl": 0.12158203125, + "learning_rate": 1e-06, + "loss": 0.0267, + "num_tokens": 347946119.0, + "reward": 1.5232144594192505, + "reward_std": 0.20710141956806183, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5321428179740906, + "rewards/curriculum_aware_reward_fn/std": 0.36937108635902405, + "step": 2877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2400.0, + "completions/mean_length": 1164.02685546875, + "completions/mean_terminated_length": 968.5619506835938, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 2.9698220273407276, + "grad_norm": 0.5432129502296448, + "kl": 0.1143798828125, + "learning_rate": 1e-06, + "loss": 0.0151, + "num_tokens": 348145083.0, + "reward": 1.3718750476837158, + "reward_std": 0.2610403299331665, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.38973215222358704, + "rewards/curriculum_aware_reward_fn/std": 0.35635408759117126, + "step": 2878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3759.0, + "completions/mean_length": 971.1428833007812, + "completions/mean_terminated_length": 914.3272705078125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 2.970853752901728, + "grad_norm": 0.517999529838562, + "kl": 0.1328125, + "learning_rate": 1e-06, + "loss": -0.0221, + "num_tokens": 348315248.0, + "reward": 1.4370535612106323, + "reward_std": 0.17417265474796295, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.43705353140830994, + "rewards/curriculum_aware_reward_fn/std": 0.3891979157924652, + "step": 2879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4024.0, + "completions/mean_length": 895.1964721679688, + "completions/mean_terminated_length": 837.0, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 2.971885478462729, + "grad_norm": 0.5721215605735779, + "kl": 0.13330078125, + "learning_rate": 1e-06, + "loss": 0.0032, + "num_tokens": 348487785.0, + "reward": 1.5169644355773926, + "reward_std": 0.23866644501686096, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.516964316368103, + "rewards/curriculum_aware_reward_fn/std": 0.4103148579597473, + "step": 2880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3408.0, + "completions/mean_length": 931.08935546875, + "completions/mean_terminated_length": 873.54541015625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 2.9729172040237297, + "grad_norm": 0.5178924798965454, + "kl": 0.12646484375, + "learning_rate": 1e-06, + "loss": -0.0215, + "num_tokens": 348654164.0, + "reward": 1.5508930683135986, + "reward_std": 0.23712819814682007, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.1330273300409317, + "rewards/curriculum_aware_reward_fn/mean": 0.5687499642372131, + "rewards/curriculum_aware_reward_fn/std": 0.4166644215583801, + "step": 2881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3908.0, + "completions/mean_length": 1176.634033203125, + "completions/mean_terminated_length": 982.0095825195312, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 2.9739489295847306, + "grad_norm": 0.42219457030296326, + "kl": 0.10546875, + "learning_rate": 1e-06, + "loss": -0.0427, + "num_tokens": 348850830.0, + "reward": 1.4392858743667603, + "reward_std": 0.23737432062625885, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4482142925262451, + "rewards/curriculum_aware_reward_fn/std": 0.47081151604652405, + "step": 2882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4067.0, + "completions/mean_length": 1074.0804443359375, + "completions/mean_terminated_length": 990.908203125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 2.9749806551457314, + "grad_norm": 0.48693951964378357, + "kl": 0.135498046875, + "learning_rate": 1e-06, + "loss": 0.0446, + "num_tokens": 349037571.0, + "reward": 1.5375001430511475, + "reward_std": 0.2253728061914444, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5464285612106323, + "rewards/curriculum_aware_reward_fn/std": 0.39925122261047363, + "step": 2883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3462.0, + "completions/mean_length": 918.5178833007812, + "completions/mean_terminated_length": 860.7454223632812, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 2.976012380706732, + "grad_norm": 0.6096733212471008, + "kl": 0.13134765625, + "learning_rate": 1e-06, + "loss": 0.0096, + "num_tokens": 349209437.0, + "reward": 1.555803656578064, + "reward_std": 0.21068914234638214, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5558035969734192, + "rewards/curriculum_aware_reward_fn/std": 0.4252580404281616, + "step": 2884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.044642857142857095, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3727.0, + "completions/mean_length": 1129.982177734375, + "completions/mean_terminated_length": 991.3831176757812, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 2.9770441062677326, + "grad_norm": 0.5170513391494751, + "kl": 0.124755859375, + "learning_rate": 1e-06, + "loss": -0.0205, + "num_tokens": 349406653.0, + "reward": 1.4973214864730835, + "reward_std": 0.2478838711977005, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5062500238418579, + "rewards/curriculum_aware_reward_fn/std": 0.4440357983112335, + "step": 2885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0357142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4037.0, + "completions/mean_length": 1047.7679443359375, + "completions/mean_terminated_length": 934.870361328125, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 2.9780758318287335, + "grad_norm": 0.6094416379928589, + "kl": 0.1243896484375, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 349590188.0, + "reward": 1.4638394117355347, + "reward_std": 0.22185523808002472, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4638392925262451, + "rewards/curriculum_aware_reward_fn/std": 0.41471850872039795, + "step": 2886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.044642857142857095, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3679.0, + "completions/mean_length": 1009.02685546875, + "completions/mean_terminated_length": 864.7756958007812, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 2.9791075573897343, + "grad_norm": 0.4644887447357178, + "kl": 0.1251220703125, + "learning_rate": 1e-06, + "loss": -0.0105, + "num_tokens": 349766864.0, + "reward": 1.606696605682373, + "reward_std": 0.18360301852226257, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6066964268684387, + "rewards/curriculum_aware_reward_fn/std": 0.45329102873802185, + "step": 2887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4073.0, + "completions/mean_length": 974.77685546875, + "completions/mean_terminated_length": 888.8715209960938, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 2.980139282950735, + "grad_norm": 0.4979686439037323, + "kl": 0.1282958984375, + "learning_rate": 1e-06, + "loss": 0.0453, + "num_tokens": 349941783.0, + "reward": 1.6200894117355347, + "reward_std": 0.1684245765209198, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6200892329216003, + "rewards/curriculum_aware_reward_fn/std": 0.4342571496963501, + "step": 2888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3619.0, + "completions/mean_length": 1229.482177734375, + "completions/mean_terminated_length": 1150.587158203125, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 2.981171008511736, + "grad_norm": 0.47304999828338623, + "kl": 0.11767578125, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 350158060.0, + "reward": 1.4955357313156128, + "reward_std": 0.2193337231874466, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4955357611179352, + "rewards/curriculum_aware_reward_fn/std": 0.3761201798915863, + "step": 2889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3940.0, + "completions/mean_length": 998.1339721679688, + "completions/mean_terminated_length": 912.8715209960938, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 2.982202734072737, + "grad_norm": 0.5146641135215759, + "kl": 0.12060546875, + "learning_rate": 1e-06, + "loss": 0.0306, + "num_tokens": 350327566.0, + "reward": 1.5937501192092896, + "reward_std": 0.216999813914299, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.59375, + "rewards/curriculum_aware_reward_fn/std": 0.39135506749153137, + "step": 2890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0535714285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3877.0, + "completions/mean_length": 1154.3035888671875, + "completions/mean_terminated_length": 987.79248046875, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 2.9832344596337377, + "grad_norm": 0.45033833384513855, + "kl": 0.117919921875, + "learning_rate": 1e-06, + "loss": -0.0186, + "num_tokens": 350530831.0, + "reward": 1.4026787281036377, + "reward_std": 0.1656990796327591, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.41160717606544495, + "rewards/curriculum_aware_reward_fn/std": 0.42838382720947266, + "step": 2891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3993.0, + "completions/mean_length": 1066.4285888671875, + "completions/mean_terminated_length": 864.4571533203125, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 2.984266185194738, + "grad_norm": 0.5000366568565369, + "kl": 0.1221923828125, + "learning_rate": 1e-06, + "loss": 0.0713, + "num_tokens": 350718878.0, + "reward": 1.591071605682373, + "reward_std": 0.22192177176475525, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5999999642372131, + "rewards/curriculum_aware_reward_fn/std": 0.3897908926010132, + "step": 2892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3983.0, + "completions/mean_length": 1087.821533203125, + "completions/mean_terminated_length": 1005.0274658203125, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 2.985297910755739, + "grad_norm": 0.4949439465999603, + "kl": 0.1258544921875, + "learning_rate": 1e-06, + "loss": 0.0599, + "num_tokens": 350908691.0, + "reward": 1.5763394832611084, + "reward_std": 0.24737325310707092, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5763393044471741, + "rewards/curriculum_aware_reward_fn/std": 0.4229327440261841, + "step": 2893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0357142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4035.0, + "completions/mean_length": 1050.2857666015625, + "completions/mean_terminated_length": 937.4815063476562, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 2.9863296363167398, + "grad_norm": 0.3857535123825073, + "kl": 0.11279296875, + "learning_rate": 1e-06, + "loss": -0.0305, + "num_tokens": 351087601.0, + "reward": 1.4616073369979858, + "reward_std": 0.1593252718448639, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4705357253551483, + "rewards/curriculum_aware_reward_fn/std": 0.458677738904953, + "step": 2894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3312.0, + "completions/mean_length": 916.4107666015625, + "completions/mean_terminated_length": 887.7658081054688, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 2.9873613618777406, + "grad_norm": 0.5426157712936401, + "kl": 0.13037109375, + "learning_rate": 1e-06, + "loss": 0.0921, + "num_tokens": 351259917.0, + "reward": 1.533482313156128, + "reward_std": 0.2218896448612213, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5424107313156128, + "rewards/curriculum_aware_reward_fn/std": 0.4264989197254181, + "step": 2895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3318.0, + "completions/mean_length": 940.4553833007812, + "completions/mean_terminated_length": 853.60546875, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 2.9883930874387414, + "grad_norm": 0.5972111225128174, + "kl": 0.1295166015625, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 351431013.0, + "reward": 1.4687501192092896, + "reward_std": 0.19047865271568298, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46875, + "rewards/curriculum_aware_reward_fn/std": 0.4237395226955414, + "step": 2896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0535714285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3235.0, + "completions/mean_length": 1138.3660888671875, + "completions/mean_terminated_length": 970.9528198242188, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 2.989424812999742, + "grad_norm": 0.5497664213180542, + "kl": 0.1187744140625, + "learning_rate": 1e-06, + "loss": -0.047, + "num_tokens": 351632073.0, + "reward": 1.4107143878936768, + "reward_std": 0.24470612406730652, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.4107142984867096, + "rewards/curriculum_aware_reward_fn/std": 0.35134419798851013, + "step": 2897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3804.0, + "completions/max_terminated_length": 3804.0, + "completions/mean_length": 744.1428833007812, + "completions/mean_terminated_length": 744.1428833007812, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 2.9904565385607427, + "grad_norm": 0.6259168386459351, + "kl": 0.130126953125, + "learning_rate": 1e-06, + "loss": 0.037, + "num_tokens": 351781336.0, + "reward": 1.4834822416305542, + "reward_std": 0.25032472610473633, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.48348215222358704, + "rewards/curriculum_aware_reward_fn/std": 0.41260480880737305, + "step": 2898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2275.0, + "completions/max_terminated_length": 2275.0, + "completions/mean_length": 776.482177734375, + "completions/mean_terminated_length": 776.482177734375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 2.9914882641217435, + "grad_norm": 0.4901755750179291, + "kl": 0.1298828125, + "learning_rate": 1e-06, + "loss": 0.0476, + "num_tokens": 351937238.0, + "reward": 1.4642857313156128, + "reward_std": 0.1534547507762909, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.4732142388820648, + "rewards/curriculum_aware_reward_fn/std": 0.44615110754966736, + "step": 2899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2774.0, + "completions/max_terminated_length": 2774.0, + "completions/mean_length": 758.8660888671875, + "completions/mean_terminated_length": 758.8660888671875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 2.9925199896827444, + "grad_norm": 0.6221413612365723, + "kl": 0.1455078125, + "learning_rate": 1e-06, + "loss": 0.0127, + "num_tokens": 352083523.0, + "reward": 1.4660714864730835, + "reward_std": 0.22209754586219788, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.46607139706611633, + "rewards/curriculum_aware_reward_fn/std": 0.43904146552085876, + "step": 2900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2544.0, + "completions/mean_length": 846.5357666015625, + "completions/mean_terminated_length": 817.2612915039062, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 2.993551715243745, + "grad_norm": 0.5860307216644287, + "kl": 0.1160888671875, + "learning_rate": 1e-06, + "loss": -0.0611, + "num_tokens": 352247390.0, + "reward": 1.5151787996292114, + "reward_std": 0.20008236169815063, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.5151785612106323, + "rewards/curriculum_aware_reward_fn/std": 0.416594922542572, + "step": 2901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3071.0, + "completions/mean_length": 966.8035888671875, + "completions/mean_terminated_length": 880.6788330078125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 2.994583440804746, + "grad_norm": 0.5898165702819824, + "kl": 0.137451171875, + "learning_rate": 1e-06, + "loss": -0.0341, + "num_tokens": 352413310.0, + "reward": 1.6013394594192505, + "reward_std": 0.2658516764640808, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6013393402099609, + "rewards/curriculum_aware_reward_fn/std": 0.40959545969963074, + "step": 2902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3570.0, + "completions/mean_length": 728.4553833007812, + "completions/mean_terminated_length": 698.1171264648438, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 2.995615166365747, + "grad_norm": 0.6526123881340027, + "kl": 0.130859375, + "learning_rate": 1e-06, + "loss": 0.04, + "num_tokens": 352562685.0, + "reward": 1.6022323369979858, + "reward_std": 0.20173947513103485, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.6022320985794067, + "rewards/curriculum_aware_reward_fn/std": 0.40810418128967285, + "step": 2903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4024.0, + "completions/max_terminated_length": 4024.0, + "completions/mean_length": 784.669677734375, + "completions/mean_terminated_length": 784.669677734375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 2.9966468919267477, + "grad_norm": 0.582514762878418, + "kl": 0.1290283203125, + "learning_rate": 1e-06, + "loss": 0.0092, + "num_tokens": 352714296.0, + "reward": 1.5281251668930054, + "reward_std": 0.17819184064865112, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.528124988079071, + "rewards/curriculum_aware_reward_fn/std": 0.39385947585105896, + "step": 2904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2639.0, + "completions/mean_length": 700.7857666015625, + "completions/mean_terminated_length": 670.1982421875, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 2.997678617487748, + "grad_norm": 0.6107363700866699, + "kl": 0.1234130859375, + "learning_rate": 1e-06, + "loss": 0.0158, + "num_tokens": 352856750.0, + "reward": 1.588392972946167, + "reward_std": 0.2505446672439575, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09449111670255661, + "rewards/curriculum_aware_reward_fn/mean": 0.5973213911056519, + "rewards/curriculum_aware_reward_fn/std": 0.4182136654853821, + "step": 2905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3624.0, + "completions/mean_length": 772.169677734375, + "completions/mean_terminated_length": 742.2252197265625, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 2.998710343048749, + "grad_norm": 0.6734402179718018, + "kl": 0.1328125, + "learning_rate": 1e-06, + "loss": -0.0523, + "num_tokens": 353016229.0, + "reward": 1.3513394594192505, + "reward_std": 0.22044141590595245, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.35133928060531616, + "rewards/curriculum_aware_reward_fn/std": 0.4162498712539673, + "step": 2906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2167.0, + "completions/max_terminated_length": 2167.0, + "completions/mean_length": 748.3200073242188, + "completions/mean_terminated_length": 748.3200073242188, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 2.99974206860975, + "grad_norm": 0.6013002991676331, + "kl": 0.1295166015625, + "learning_rate": 1e-06, + "loss": 0.0093, + "num_tokens": 353173097.0, + "reward": 1.3821431398391724, + "reward_std": 0.18830367922782898, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.3821428716182709, + "rewards/curriculum_aware_reward_fn/std": 0.3469855487346649, + "step": 2907 + }, + { + "epoch": 2.99974206860975, + "step": 2907, + "total_flos": 0.0, + "train_loss": 0.015996046145483354, + "train_runtime": 176359.9042, + "train_samples_per_second": 0.264, + "train_steps_per_second": 0.016 + } + ], + "logging_steps": 1, + "max_steps": 2907, + "num_input_tokens_seen": 353173097, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}